Part 6 · 6.5 · 30 min read

In Practice: VCF Files with Python

Parse, filter, annotate, and analyze VCF files with Python — from basic manipulation to somatic variant analysis and mutational signature decomposition.

VCF · variant analysis · python · practical · genomics

VCF (Variant Call Format) files are the universal format for genomic variant data — the output of every variant caller, the input to every annotation pipeline, and the central data type in clinical genomics and cancer biology. Every bioinformatics practitioner working with genomic data will spend significant time manipulating VCF files.

This chapter covers the practical skills: parsing VCFs, filtering variants, computing statistics, working with gnomAD frequencies, and performing basic mutational signature analysis.

Setup

bash
pip install cyvcf2 pandas numpy matplotlib seaborn requests
# cyvcf2 is faster than PyVCF for large files; PyVCF is also fine for learning
python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from collections import Counter, defaultdict

The VCF Format in Detail

Before writing code, understand the structure:

##fileformat=VCFv4.2                        ← format version
##reference=GRCh38                          ← reference genome
##FILTER=<ID=PASS,Description="...">        ← filter definitions
##INFO=<ID=AF,Number=A,Type=Float,...>      ← INFO field definitions
##FORMAT=<ID=GT,Number=1,Type=String,...>   ← FORMAT field definitions
#CHROM  POS   ID    REF  ALT  QUAL  FILTER  INFO              FORMAT  SAMPLE
chr17   7674220  .  G   T   100   PASS   AF=0.001;DP=50   GT:AD:DP  0/1:25,20:45

Key concepts:

  • POS is 1-based (not 0-based like Python strings)
  • REF/ALT describe the change at that position
  • INFO holds per-variant annotations (;-delimited key=value pairs)
  • FORMAT specifies the per-sample fields; the SAMPLE columns contain values in that order
  • GT=0/1 means heterozygous (one ref allele, one alt); 0/0 = hom-ref; 1/1 = hom-alt
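One consequence of the 1-based convention worth internalizing: to index into a Python string holding reference sequence, subtract one from POS. A toy illustration (the sequence is made up):

```python
reference = "ACGTGACCT"        # toy reference sequence (illustrative)
vcf_pos, ref_allele = 5, "G"   # a VCF record claiming POS=5 is G

# VCF POS is 1-based; Python strings are 0-based
base = reference[vcf_pos - 1]
print(base)  # G
```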

Parsing VCF Files

Using cyvcf2 (Recommended for Large Files)

python
import cyvcf2

def parse_vcf(vcf_path: str) -> list[dict]:
    """Parse a VCF file into a list of variant dictionaries."""
    variants = []
    vcf = cyvcf2.VCF(vcf_path)

    for variant in vcf:
        record = {
            "chrom": variant.CHROM,
            "pos": variant.POS,
            "ref": variant.REF,
            "alt": ",".join(variant.ALT),
            "qual": variant.QUAL,
            "filter": variant.FILTER or "PASS",  # cyvcf2 returns None for PASS
            "id": variant.ID or ".",
        }

        # INFO fields
        for key in variant.INFO:
            record[f"info_{key}"] = variant.INFO.get(key)

        # Per-sample genotypes; fetch AD (allele depths) once per record,
        # not once per sample
        ad = variant.format("AD")
        for i, sample in enumerate(vcf.samples):
            gt = variant.genotypes[i]  # e.g. [0, 1, True]: alleles + phased flag
            record[f"gt_{sample}"] = f"{gt[0]}/{gt[1]}"
            if ad is not None:
                record[f"ad_{sample}"] = list(ad[i])

        variants.append(record)

    vcf.close()
    return variants

A Simple Parser (No Dependencies)

For learning or for small files:

python
def parse_vcf_simple(vcf_path: str) -> tuple[list, list, list]:
    """Returns (header_lines, column_headers, variant_rows)."""
    header_lines = []
    column_headers = []
    variants = []

    with open(vcf_path) as f:
        for line in f:
            line = line.rstrip()
            if line.startswith("##"):
                header_lines.append(line)
            elif line.startswith("#CHROM"):
                column_headers = line[1:].split("\t")  # remove leading #
            else:
                fields = line.split("\t")
                record = dict(zip(column_headers, fields))
                variants.append(record)

    return header_lines, column_headers, variants

def parse_info_field(info_str: str) -> dict:
    """Parse semicolon-delimited INFO field into a dict."""
    result = {}
    if info_str == ".":
        return result
    for item in info_str.split(";"):
        if "=" in item:
            key, val = item.split("=", 1)
            result[key] = val
        else:
            result[item] = True  # flag field
    return result

def parse_genotype(format_str: str, sample_str: str) -> dict:
    """Parse FORMAT + SAMPLE fields."""
    keys = format_str.split(":")
    values = sample_str.split(":")
    return dict(zip(keys, values))
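Applied to the example record from the format section above (FORMAT `GT:AD:DP`, sample `0/1:25,20:45`), the FORMAT/SAMPLE pairing is just a zip, and INFO parsing splits on `;` and `=`:

```python
format_str = "GT:AD:DP"
sample_str = "0/1:25,20:45"
genotype = dict(zip(format_str.split(":"), sample_str.split(":")))
print(genotype)  # {'GT': '0/1', 'AD': '25,20', 'DP': '45'}

# INFO: key=value pairs become entries; bare keys become boolean flags
info = dict(
    item.split("=", 1) if "=" in item else (item, True)
    for item in "AF=0.001;DP=50".split(";")
)
print(info)  # {'AF': '0.001', 'DP': '50'}
```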

Working with a VCF DataFrame

Converting to pandas makes filtering and analysis easier:

python
def vcf_to_dataframe(vcf_path: str) -> pd.DataFrame:
    """Parse VCF to a pandas DataFrame."""
    rows = []
    with open(vcf_path) as f:
        headers = []
        samples = []
        for line in f:
            line = line.rstrip()
            if line.startswith("##"):
                continue
            elif line.startswith("#CHROM"):
                headers = line[1:].split("\t")
                samples = headers[9:]  # sample columns start at index 9 (0-based)
            else:
                fields = line.split("\t")
                row = dict(zip(headers[:9], fields[:9]))

                # Parse INFO
                info = parse_info_field(row.get("INFO", "."))
                row.update({f"info_{k}": v for k, v in info.items()})

                # Parse per-sample FORMAT
                fmt = row.get("FORMAT", "")
                for i, sample in enumerate(samples):
                    if 9 + i < len(fields):
                        gt_data = parse_genotype(fmt, fields[9 + i])
                        row.update({f"{sample}_{k}": v for k, v in gt_data.items()})

                rows.append(row)

    return pd.DataFrame(rows)

Filtering Variants

Standard quality filters:

python
def filter_variants(df: pd.DataFrame, sample: str = "TUMOR") -> pd.DataFrame:
    filtered = df.copy()

    # 1. Only PASS variants
    filtered = filtered[filtered["FILTER"] == "PASS"]

    # 2. Minimum depth
    depth_col = f"{sample}_DP"
    if depth_col in filtered.columns:
        filtered[depth_col] = pd.to_numeric(filtered[depth_col], errors="coerce")
        filtered = filtered[filtered[depth_col] >= 20]

    # 3. Minimum variant allele frequency (VAF) for somatic calling
    ad_col = f"{sample}_AD"
    if ad_col in filtered.columns:
        def compute_vaf(ad_str):
            try:
                parts = str(ad_str).split(",")
                ref_count, alt_count = int(parts[0]), int(parts[1])
                total = ref_count + alt_count
                return alt_count / total if total > 0 else 0
            except (ValueError, IndexError):
                return 0

        filtered["vaf"] = filtered[ad_col].apply(compute_vaf)
        filtered = filtered[filtered["vaf"] >= 0.05]  # ≥5% VAF

    return filtered

print(f"Variants before filtering: {len(df)}")
df_filtered = filter_variants(df)
print(f"Variants after filtering: {len(df_filtered)}")
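The VAF arithmetic inside filter_variants can be sanity-checked on a single AD string (using the 25,20 example from the format section):

```python
def vaf_from_ad(ad_str: str) -> float:
    """VAF = alt reads / (ref reads + alt reads), from a 'ref,alt' AD string."""
    ref_count, alt_count = (int(x) for x in ad_str.split(",")[:2])
    total = ref_count + alt_count
    return alt_count / total if total else 0.0

print(round(vaf_from_ad("25,20"), 3))  # 0.444
```

A het germline variant at good depth should sit near 0.5; subclonal somatic variants are often far lower, which is why the 5% floor above is a floor and not a target.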

Computing Variant Statistics

python
def variant_statistics(df: pd.DataFrame) -> dict:
    """Compute a summary of variant types and metrics."""

    def classify_variant(row):
        ref = str(row["REF"])
        alts = str(row["ALT"]).split(",")
        alt = alts[0]  # take first alt allele

        if len(ref) == 1 and len(alt) == 1:
            return "SNV"
        elif len(ref) > len(alt):
            return "DEL"
        elif len(ref) < len(alt):
            return "INS"
        else:
            return "MNV"

    df = df.copy()
    df["variant_type"] = df.apply(classify_variant, axis=1)

    # Transition/Transversion ratio (Ti/Tv)
    def classify_substitution(row):
        if row["variant_type"] != "SNV":
            return None
        ref, alt = row["REF"], row["ALT"].split(",")[0]
        purines = {"A", "G"}
        if (ref in purines) == (alt in purines):
            return "transition"  # A↔G, C↔T
        else:
            return "transversion"  # A↔C, A↔T, G↔C, G↔T

    df["sub_type"] = df.apply(classify_substitution, axis=1)
    snvs = df[df["variant_type"] == "SNV"]
    ti = (snvs["sub_type"] == "transition").sum()
    tv = (snvs["sub_type"] == "transversion").sum()

    return {
        "total_variants": len(df),
        "variant_types": df["variant_type"].value_counts().to_dict(),
        "ti_count": int(ti),
        "tv_count": int(tv),
        "ti_tv_ratio": round(ti / tv, 3) if tv > 0 else float("inf"),
    }

stats = variant_statistics(df_filtered)
for key, val in stats.items():
    print(f"{key}: {val}")
Ti/Tv Ratio as a Quality Metric

In the human genome, the expected Ti/Tv ratio for exomes is ~2.8–3.5 (CpG deamination preferentially causes transitions). For whole genomes it is ~2.0–2.1. A ratio well below 2 suggests variant-calling artifacts: random false positives pull Ti/Tv toward ~0.5, because each base has twice as many possible transversions as transitions. A ratio well above 3 in an exome is also worth investigating. Ti/Tv is one of the first QC checks in any variant-calling pipeline.
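Why does random noise pull Ti/Tv down rather than toward the biological ~2–3? Each base has one possible transition and two possible transversions, so uniform errors give a 4:8 split across the 12 substitution types — easy to verify by enumeration:

```python
from itertools import permutations

purines = {"A", "G"}
subs = list(permutations("ACGT", 2))  # all 12 single-base substitutions
ti = [s for s in subs if (s[0] in purines) == (s[1] in purines)]
tv = [s for s in subs if (s[0] in purines) != (s[1] in purines)]

print(len(ti), len(tv), len(ti) / len(tv))  # 4 8 0.5
```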

Mutation Spectrum Analysis

python
COMPLEMENT = {"A": "T", "T": "A", "C": "G", "G": "C"}

def reverse_complement(seq: str) -> str:
    return "".join(COMPLEMENT[b] for b in reversed(seq))

def normalize_mutation(ref: str, alt: str, context: str) -> tuple[str, str]:
    """Normalize mutation to pyrimidine reference (C or T)."""
    if ref in ("C", "T"):
        return f"{context[0]}[{ref}>{alt}]{context[2]}", ref
    else:  # ref is A or G — use complement
        ref_c = COMPLEMENT[ref]
        alt_c = COMPLEMENT[alt]
        ctx_c = reverse_complement(context)
        return f"{ctx_c[0]}[{ref_c}>{alt_c}]{ctx_c[2]}", ref_c

def compute_mutation_spectrum(
    df: pd.DataFrame,
    reference_fasta_path: str | None = None
) -> dict:
    """
    Compute the 96-trinucleotide mutation spectrum from SNVs.
    If reference_fasta_path is None, uses simplified context from position.
    """
    spectrum = defaultdict(int)

    # Combine both conditions in one boolean mask (chained indexing misaligns rows)
    snvs = df[(df["REF"].str.len() == 1) & (df["ALT"].str.len() == 1)].copy()

    for _, row in snvs.iterrows():
        ref = str(row["REF"]).upper()
        alt = str(row["ALT"]).split(",")[0].upper()

        if ref not in "ACGT" or alt not in "ACGT" or ref == alt:
            continue

        # Without reference, use simplified mutation type
        mut_type = f"{ref}>{alt}"
        spectrum[mut_type] += 1

    return dict(spectrum)

spectrum = compute_mutation_spectrum(df_filtered)
print("\nMutation spectrum:")
for mut_type, count in sorted(spectrum.items()):
    bar = "█" * (count // max(1, max(spectrum.values()) // 30))
    print(f"  {mut_type}: {count:5d} {bar}")
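plot_sbs_spectrum expects the six pyrimidine-reference classes, but compute_mutation_spectrum without a reference may emit all 12 raw substitution types. They can be collapsed via complements first — the same normalization normalize_mutation applies, minus the trinucleotide context (the counts below are made-up):

```python
from collections import Counter

COMPLEMENT = {"A": "T", "T": "A", "C": "G", "G": "C"}

def collapse_to_pyrimidine(mut: str) -> str:
    """Fold 'G>T' -> 'C>A' etc., keeping the six C/T-reference classes."""
    ref, alt = mut.split(">")
    if ref in ("C", "T"):
        return mut
    return f"{COMPLEMENT[ref]}>{COMPLEMENT[alt]}"

raw = {"G>T": 40, "C>A": 35, "A>G": 10, "T>C": 12}  # illustrative counts
collapsed = Counter()
for mut, n in raw.items():
    collapsed[collapse_to_pyrimidine(mut)] += n

print(dict(collapsed))  # {'C>A': 75, 'T>C': 22}
```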

Visualizing the SBS Spectrum

python
def plot_sbs_spectrum(spectrum: dict, title: str = "Mutation Spectrum"):
    """Plot a simplified SBS spectrum bar chart."""
    # Group by substitution type
    sub_types = ["C>A", "C>G", "C>T", "T>A", "T>C", "T>G"]
    colors = ["steelblue", "black", "red", "lightgray", "green", "pink"]

    fig, ax = plt.subplots(figsize=(10, 5))

    x_positions = range(len(sub_types))
    counts = [spectrum.get(s, 0) for s in sub_types]
    total = sum(counts)
    fractions = [c / total if total > 0 else 0 for c in counts]

    bars = ax.bar(x_positions, fractions, color=colors, edgecolor="white", linewidth=0.5)

    ax.set_xticks(x_positions)
    ax.set_xticklabels(sub_types, rotation=45, ha="right")
    ax.set_ylabel("Fraction of mutations")
    ax.set_title(title)
    ax.set_ylim(0, 1)

    for bar, count in zip(bars, counts):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                str(count), ha="center", va="bottom", fontsize=9)

    plt.tight_layout()
    plt.savefig("mutation_spectrum.png", dpi=150)
    plt.show()

plot_sbs_spectrum(spectrum)

Querying gnomAD for Population Frequencies

python
def query_gnomad_v4(chrom: str, pos: int, ref: str, alt: str) -> dict:
    """Query gnomAD v4 GraphQL API for variant allele frequency."""
    query = """
    query VariantDetails($variantId: String!) {
        variant(variantId: $variantId, dataset: gnomad_r4) {
            variantId
            exome {
                ac
                an
                af
                filters
            }
            genome {
                ac
                an
                af
                filters
            }
        }
    }
    """
    import requests

    # gnomAD variant ID format: chrom-pos-ref-alt
    variant_id = f"{chrom.replace('chr', '')}-{pos}-{ref}-{alt}"

    response = requests.post(
        "https://gnomad.broadinstitute.org/api",
        json={"query": query, "variables": {"variantId": variant_id}},
        headers={"Content-Type": "application/json"},
        timeout=10
    )

    if response.status_code != 200:
        return {"error": f"HTTP {response.status_code}"}

    data = response.json()
    if "errors" in data:
        return {"error": data["errors"]}

    variant_data = data.get("data", {}).get("variant", {})
    if not variant_data:
        return {"ac": 0, "an": 0, "af": 0, "found": False}

    exome = variant_data.get("exome") or {}
    genome = variant_data.get("genome") or {}

    # Combined AF must come from pooled allele counts; summing the two AFs
    # would double-count and is not a frequency
    total_ac = (exome.get("ac") or 0) + (genome.get("ac") or 0)
    total_an = (exome.get("an") or 0) + (genome.get("an") or 0)

    return {
        "found": True,
        "exome_af": exome.get("af") or 0,
        "exome_ac": exome.get("ac") or 0,
        "genome_af": genome.get("af") or 0,
        "genome_ac": genome.get("ac") or 0,
        "combined_af": total_ac / total_an if total_an > 0 else 0,
    }

# Example: look up a known TP53 hotspot (R248Q, GRCh38 chr17:7674220 C>T)
tp53_variant = query_gnomad_v4("chr17", 7674220, "C", "T")
print(f"TP53 R248Q in gnomAD: {tp53_variant}")
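Two small helpers are handy around this query: building the ID string gnomAD expects, and a common heuristic for flagging likely-germline calls in tumor-only analysis (the 0.1% threshold is a conventional choice, not a rule):

```python
def gnomad_variant_id(chrom: str, pos: int, ref: str, alt: str) -> str:
    """gnomAD expects 'chrom-pos-ref-alt' without a 'chr' prefix."""
    return f"{chrom.removeprefix('chr')}-{pos}-{ref}-{alt}"

def is_likely_germline(af: float, threshold: float = 0.001) -> bool:
    # True somatic mutations are essentially absent from population databases,
    # so anything at >=0.1% population frequency is almost certainly inherited.
    return af >= threshold

print(gnomad_variant_id("chr17", 7674220, "G", "T"))  # 17-7674220-G-T
print(is_likely_germline(0.02))                       # True
```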

Complete Analysis Pipeline

python
def analyze_tumor_vcf(vcf_path: str, sample_name: str = "TUMOR") -> dict:
    """Full pipeline: load VCF → filter → compute statistics → check gnomAD."""
    import time

    print(f"Loading {vcf_path}...")
    df = vcf_to_dataframe(vcf_path)
    print(f"  Loaded {len(df)} raw variants")

    df_filt = filter_variants(df, sample_name)
    print(f"  After filtering: {len(df_filt)} variants")

    stats = variant_statistics(df_filt)
    spectrum = compute_mutation_spectrum(df_filt)

    # Check gnomAD for top variants
    print("  Querying gnomAD for top variants...")
    gnomad_results = []
    for _, row in df_filt.head(10).iterrows():
        try:
            result = query_gnomad_v4(row["CHROM"], int(row["POS"]), row["REF"], row["ALT"])
            gnomad_results.append({
                "variant": f"{row['CHROM']}:{row['POS']} {row['REF']}>{row['ALT']}",
                "gnomad_af": result.get("combined_af", 0),
                "in_gnomad": result.get("found", False)
            })
            time.sleep(0.3)  # rate limiting
        except Exception as e:
            gnomad_results.append({"variant": f"{row['POS']}", "error": str(e)})

    return {
        "stats": stats,
        "spectrum": spectrum,
        "gnomad_check": gnomad_results
    }

Creating a Synthetic VCF for Testing

python
def create_synthetic_vcf(output_path: str, n_variants: int = 100):
    """Create a synthetic VCF for testing pipelines."""
    import random
    random.seed(42)

    # VCF spec requires double quotes around Description values
    header = [
        "##fileformat=VCFv4.2",
        "##reference=GRCh38",
        '##FILTER=<ID=PASS,Description="All filters passed">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Total depth">',
        '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allele depths">',
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tTUMOR",
    ]

    bases = list("ACGT")
    chroms = [f"chr{i}" for i in range(1, 23)] + ["chrX"]

    with open(output_path, "w") as f:
        f.write("\n".join(header) + "\n")
        for _ in range(n_variants):
            chrom = random.choice(chroms)
            pos = random.randint(1000000, 200000000)
            ref = random.choice(bases)
            alt = random.choice([b for b in bases if b != ref])
            qual = random.randint(50, 200)
            depth = random.randint(30, 200)
            alt_count = random.randint(5, depth // 2)
            ref_count = depth - alt_count
            line = f"{chrom}\t{pos}\t.\t{ref}\t{alt}\t{qual}\tPASS\t.\tGT:DP:AD\t0/1:{depth}:{ref_count},{alt_count}"
            f.write(line + "\n")

    print(f"Created {output_path} with {n_variants} synthetic variants")

# Create test data
create_synthetic_vcf("test.vcf", 200)
df = vcf_to_dataframe("test.vcf")
df_filt = filter_variants(df)
stats = variant_statistics(df_filt)
print(f"\nTest VCF stats: {stats}")

Summary

Task                       Tool/Approach
Parse VCF                  cyvcf2 (fast), PyVCF (pure Python), or manual parsing
Filter variants            FILTER=PASS, depth, VAF thresholds
Compute statistics         Ti/Tv ratio, variant type distribution
Annotate                   VEP, ANNOVAR, SnpEff (command line)
Query population freq.     gnomAD GraphQL API or a local tabix index
Mutational signatures      SigProfilerExtractor, MutationalPatterns (R), or custom code

For large-scale variant analysis (whole-genome cohorts), use command-line tools: bcftools for manipulation, VEP or ANNOVAR for annotation, GATK or DeepVariant for variant calling. Python scripts wrap these tools and parse their outputs.