A genômica viral é uma das áreas mais ativas em bioinformática. Vigilância baseada em sequenciamento, investigação de surtos, monitoramento de resistência a medicamentos e análise de alvos de vacinas requerem as mesmas habilidades centrais: alinhar sequências, identificar cepas e colocar sequências em contexto evolutivo. Este capítulo coloca essas habilidades para trabalhar com dados reais de sequências virais.
Configuração
pip install biopython matplotlib pandas numpy requests
# For local BLAST (optional but recommended for large datasets):
# conda install -c bioconda blast
from Bio import Entrez, SeqIO
from Bio.Blast import NCBIWWW, NCBIXML
from Bio import Align
import pandas as pd
import numpy as np
import requests
import time
Parte 1: Identificando uma Sequência Desconhecida com BLAST
Uma tarefa comum: você tem uma sequência de origem desconhecida (de uma amostra de paciente, amostra ambiental ou organismo novo) e precisa identificá-la.
# An unknown sequence — let's see what BLAST says
unknown_seq = (
    "ATGAGAGCCTTGTCCTTTATAGCACAAATGGCAACCAAATGCATCAAAATGAGAGAATCAGTCTCAATGG"
    "ATGGAAATGAATAAGCCTGTCAAGCTGGATGGAAACAACATTACAACAGAAGATCAAATGGGGAAGATGG"
    "AGAAAGAAGTGGTCATCAGAAGCAACAGCTTATCATCAGCAGCAGCCATCAGCAGGATAGAAGCGGCAGC"
)

Entrez.email = "your@email.com"

print(f"Query length: {len(unknown_seq)} bp")
print("Running BLAST against NT database...")

# Submit the query to NCBI's web BLAST service; this can take minutes.
blast_handle = NCBIWWW.qblast(
    program="blastn",
    database="nt",
    sequence=unknown_seq,
    hitlist_size=10,
    format_type="XML",
)

# Persist the raw XML so the slow remote query never has to be repeated.
with open("blast_result.xml", "w") as f:
    f.write(blast_handle.read())
blast_handle.close()

print("BLAST complete.")
Analisando os Resultados do BLAST
def parse_blast_results(xml_file: str) -> pd.DataFrame:
    """Parse an XML-format BLAST report into a tidy DataFrame of hits.

    Keeps only the best HSP per alignment and reports score, E-value,
    percent identity, alignment length and query coverage.

    Args:
        xml_file: path to a BLAST report saved with format_type="XML".

    Returns:
        DataFrame with one row per hit; empty if the report contains
        no query record or no alignments.
    """
    with open(xml_file) as f:
        # parse() yields one record per query; this workflow submits a
        # single query. Guard against an empty report instead of letting
        # next() raise StopIteration.
        blast_record = next(NCBIXML.parse(f), None)
    if blast_record is None:
        return pd.DataFrame()
    rows = []
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps[:1]:  # best HSP per hit (HSPs come sorted by score)
            rows.append({
                "title": alignment.title[:80],
                "accession": alignment.accession,
                "score": hsp.score,
                "e_value": hsp.expect,
                "identity_pct": 100 * hsp.identities / hsp.align_length,
                "alignment_length": hsp.align_length,
                "query_coverage_pct": 100 * (hsp.query_end - hsp.query_start + 1) / blast_record.query_length,
            })
    return pd.DataFrame(rows)
# Load the saved report and show the most informative columns per hit.
df = parse_blast_results("blast_result.xml")
print(df[["title", "identity_pct", "e_value", "query_coverage_pct"]].to_string(index=False))
Interpretando Resultados
def classify_blast_hit(identity: float, coverage: float) -> str:
    """Map percent identity and query coverage onto a qualitative verdict.

    Tiers are checked strictest-first, so the first one whose thresholds
    are met wins; anything below all tiers is flagged as a weak match.
    """
    tiers = (
        (99, 95, "Near-identical match (same strain or minor variant)"),
        (95, 90, "High confidence match (closely related strain)"),
        (80, 70, "Moderate match (same species, different strain)"),
        (70, float("-inf"), "Distant homolog (same genus or family)"),
    )
    for min_identity, min_coverage, verdict in tiers:
        if identity >= min_identity and coverage >= min_coverage:
            return verdict
    return "Weak match — possibly novel or contamination"
# Print the five best hits with a human-readable quality verdict each.
for _, row in df.head(5).iterrows():
    classification = classify_blast_hit(row["identity_pct"], row["query_coverage_pct"])
    print(f"\n{row['title'][:60]}")
    print(f" Identity: {row['identity_pct']:.1f}%, Coverage: {row['query_coverage_pct']:.1f}%")
    print(f" {classification}")
Parte 2: Buscando e Comparando Cepas de Influenza
Rastreie a evolução da hemaglutinina (HA) da influenza ao longo dos anos.
def fetch_influenza_ha_sequences(strains: list[str]) -> list:
    """Fetch HA sequences for specified influenza strains.

    For each strain name, searches Entrez for a complete hemagglutinin
    record and downloads the first match as FASTA. Strains with no hits
    are skipped with a message.

    Args:
        strains: free-text strain names, e.g. "A/Perth/16/2009 H3N2".

    Returns:
        list of SeqRecord objects, one per strain found; each record's
        description is overwritten with the query strain name.
    """
    records = []
    for strain_query in strains:
        handle = Entrez.esearch(
            db="nucleotide",
            term=f"{strain_query}[Title] AND hemagglutinin[Title] AND complete[Title] AND influenza[Organism]",
            retmax=1
        )
        try:
            result = Entrez.read(handle)
        finally:
            # Close the handle even if Entrez.read fails (was leaked on error).
            handle.close()
        if not result["IdList"]:
            print(f"No results for: {strain_query}")
            continue
        # Pause between requests to stay under NCBI's rate limit.
        time.sleep(0.4)
        handle = Entrez.efetch(
            db="nucleotide",
            id=result["IdList"][0],
            rettype="fasta",
            retmode="text"
        )
        try:
            record = SeqIO.read(handle, "fasta")
        finally:
            handle.close()
        record.description = strain_query  # use strain name as description
        records.append(record)
        print(f"Fetched: {strain_query} ({len(record.seq)} bp)")
        time.sleep(0.4)
    return records
# A subset of influenza H3N2 strains from different years
strains_to_fetch = [
    "A/Hong Kong/1/1968 H3N2",
    "A/Sydney/5/1997 H3N2",
    "A/Wisconsin/67/2005 H3N2",
    "A/Perth/16/2009 H3N2",
]
# Download the HA sequence for each strain and save them for alignment below.
ha_records = fetch_influenza_ha_sequences(strains_to_fetch)
SeqIO.write(ha_records, "influenza_ha.fasta", "fasta")
print(f"\nSaved {len(ha_records)} HA sequences")
Parte 3: Alinhamento de Sequências por Pares
Compare sequências para medir a divergência evolutiva:
from Bio import Align
from Bio.Seq import Seq
def pairwise_identity(seq1: str, seq2: str) -> float:
    """Compute percent identity using pairwise global alignment."""
    aligner = Align.PairwiseAligner()
    aligner.mode = "global"
    aligner.match_score = 1
    aligner.mismatch_score = -1
    aligner.open_gap_score = -2
    aligner.extend_gap_score = -0.5
    best = aligner.align(seq1, seq2)[0]
    # Walk the two gapped rows column by column: count identical
    # residue pairs, and count every column that holds at least one
    # residue (i.e. identity is computed over the full alignment span).
    matches = 0
    span = 0
    for top, bottom in zip(best[0], best[1]):
        if top != "-" or bottom != "-":
            span += 1
        if top != "-" and bottom != "-" and top == bottom:
            matches += 1
    return 100 * matches / span if span > 0 else 0
# Build identity matrix
n = len(ha_records)
identity_matrix = np.zeros((n, n))
strain_labels = [r.description for r in ha_records]

# Compute only the upper triangle; the matrix is symmetric by construction.
for i in range(n):
    identity_matrix[i][i] = 100.0
    for j in range(i + 1, n):
        # Align just the first 500 bp to keep the pairwise alignment fast.
        pct = pairwise_identity(str(ha_records[i].seq[:500]), str(ha_records[j].seq[:500]))
        identity_matrix[i][j] = identity_matrix[j][i] = pct

# Display as a heatmap-ready DataFrame
df_identity = pd.DataFrame(identity_matrix, index=strain_labels, columns=strain_labels)
print("\nSequence identity matrix (%):")
print(df_identity.round(1))
Parte 4: Análise de Mutações (Spike do SARS-CoV-2)
Analise mutações no domínio de ligação ao receptor da proteína spike em variantes:
# 180-residue N-terminal fragment of the SARS-CoV-2 spike protein, used as
# the reference when calling mutations below.
# NOTE(review): presumably taken from the Wuhan reference sequence — confirm source.
REFERENCE_SPIKE_FRAGMENT = (
    "QCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGT"
    "NGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEF"
    "QFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFV"
)
def find_mutations(ref_seq: str, query_seq: str, ref_start: int = 1) -> list:
    """Find amino acid mutations between two sequences (already aligned)."""
    # A column is a mutation when both residues are present (neither is a
    # gap) and they disagree; labels use standard notation, e.g. "N501Y".
    # Comparison stops at the end of the shorter sequence.
    return [
        f"{ref_aa}{ref_start + offset}{qry_aa}"
        for offset, (ref_aa, qry_aa) in enumerate(zip(ref_seq, query_seq))
        if ref_aa != qry_aa and ref_aa != "-" and qry_aa != "-"
    ]
# Known VOC spike protein mutations (simplified representation)
# Maps WHO variant label -> characteristic spike substitutions.
# Curated illustrative lists, not exhaustive lineage definitions.
variant_mutations = {
    "Alpha (B.1.1.7)": ["N501Y", "D614G", "P681H"],
    "Beta (B.1.351)": ["K417N", "E484K", "N501Y", "D614G"],
    "Delta (B.1.617.2)": ["L452R", "T478K", "D614G", "P681R"],
    "Omicron BA.1": ["G339D", "S371L", "S373P", "S375F", "K417N", "N440K",
                     "G446S", "S477N", "T478K", "E484A", "Q493R", "G496S",
                     "Q498R", "N501Y", "Y505H", "D614G", "H655Y", "N679K",
                     "P681H", "N764K", "D796Y", "N856K", "Q954H", "N969K"],
}
# Analyze which mutations affect the RBD (receptor binding domain) positions 319-541
RBD_START, RBD_END = 319, 541

def is_rbd_mutation(mutation: str) -> bool:
    """Check if mutation is in the receptor-binding domain."""
    # Pull the numeric position out of a label such as "N501Y" -> 501;
    # filtering on isdigit() tolerates odd characters between the residues.
    position = int("".join(ch for ch in mutation[1:-1] if ch.isdigit()))
    return RBD_START <= position <= RBD_END
# Tabulate, per variant, how many of its spike mutations fall in the RBD.
print("Variant Mutation Analysis:")
print(f"{'Variant':<25} {'Total mutations':>15} {'RBD mutations':>15} {'RBD rate':>12}")
print("-" * 70)
for variant, mutations in variant_mutations.items():
    rbd_muts = [m for m in mutations if is_rbd_mutation(m)]
    rate = len(rbd_muts) / len(mutations) if mutations else 0
    print(f"{variant:<25} {len(mutations):>15} {len(rbd_muts):>15} {rate:>11.0%}")
print(f"\nOmicron RBD mutations: {[m for m in variant_mutations['Omicron BA.1'] if is_rbd_mutation(m)]}")
Parte 5: Construindo uma Árvore Filogenética Simples
Use distâncias de sequências para construir uma árvore neighbor-joining:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
def distance_matrix_to_upgma_tree(labels: list, dist_matrix: np.ndarray) -> dict:
"""
Simplified UPGMA clustering.
Returns a nested dict representing the tree structure.
"""
# Convert to distance (0 = identical, 1 = maximally different)
d = (100 - dist_matrix) / 100
nodes = {i: labels[i] for i in range(len(labels))}
heights = {i: 0.0 for i in range(len(labels))}
tree = {labels[i]: None for i in range(len(labels))}
current_d = d.copy()
current_nodes = list(range(len(labels)))
while len(current_nodes) > 1:
# Find minimum distance pair
min_dist = np.inf
min_i, min_j = -1, -1
for i in range(len(current_nodes)):
for j in range(i + 1, len(current_nodes)):
ni, nj = current_nodes[i], current_nodes[j]
if current_d[ni][nj] < min_dist:
min_dist = current_d[ni][nj]
min_i, min_j = i, j
ni, nj = current_nodes[min_i], current_nodes[min_j]
new_label = f"({nodes[ni]}, {nodes[nj]})"
nodes[len(nodes)] = new_label
print(f" Merge: {nodes[ni]} + {nodes[nj]} at distance {min_dist:.3f}")
current_nodes.pop(max(min_i, min_j))
current_nodes.pop(min(min_i, min_j))
current_nodes.append(len(nodes) - 1)
return nodes
print("\nBuilding phylogenetic tree:")
# Cluster strains by HA identity; returns node-index -> nested label dict.
tree = distance_matrix_to_upgma_tree(strain_labels, identity_matrix)
Visualização com um Dendrograma Simples
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform

# Convert identity to distance
dist_array = (100 - identity_matrix) / 100
np.fill_diagonal(dist_array, 0)
condensed = squareform(dist_array)

# UPGMA linkage ("average" = UPGMA)
Z = linkage(condensed, method="average")

fig, ax = plt.subplots(figsize=(10, 5))
dendrogram(
    Z,
    labels=strain_labels,
    orientation="top",
    leaf_rotation=45,
    ax=ax
)
# With orientation="top" the leaves (strains) run along the x-axis and the
# merge height (distance) is on the y-axis; the original labels were swapped.
ax.set_xlabel("Strain")
ax.set_ylabel("Sequence distance")
ax.set_title("Influenza HA Phylogenetic Tree (UPGMA)")
plt.tight_layout()
plt.savefig("influenza_tree.png", dpi=150)
plt.show()
Parte 6: Pipeline Prático de Vigilância do SARS-CoV-2
Uma versão simplificada do que os laboratórios de vigilância genômica fazem:
def sars_cov2_surveillance_summary(accessions: list[str]) -> pd.DataFrame:
    """Fetch GenBank records and summarize basic surveillance metadata.

    For each accession, downloads the GenBank record and extracts the
    sequence length plus country and collection date from the source
    feature. Per-accession fetch failures are recorded in the output
    instead of aborting the batch.

    NOTE(review): the mutation/variant tables below are defined but not
    yet applied — variant assignment would first require aligning the
    spike sequence to a reference.

    Args:
        accessions: GenBank nucleotide accession numbers.

    Returns:
        DataFrame with one row per accession; failed rows carry an
        "error" column instead of metadata.
    """
    # Key positions to check (in spike protein) — reserved for a future
    # variant-calling step.
    SIGNATURE_MUTATIONS = {
        "D614G": (614, "D", "G"),
        "N501Y": (501, "N", "Y"),
        "E484K": (484, "E", "K"),
        "E484A": (484, "E", "A"),
        "L452R": (452, "L", "R"),
        "P681R": (681, "P", "R"),
        "P681H": (681, "P", "H"),
    }
    VARIANT_SIGNATURES = {
        "Omicron": {"N501Y", "E484A", "D614G"},
        "Delta": {"L452R", "D614G", "P681R"},
        "Beta": {"E484K", "N501Y", "D614G"},
        "Alpha": {"N501Y", "D614G", "P681H"},
        "Wuhan-like": set(),
    }
    results = []
    for acc in accessions:
        time.sleep(0.4)  # respect NCBI rate limits
        try:
            handle = Entrez.efetch(db="nucleotide", id=acc, rettype="gb", retmode="text")
            try:
                record = SeqIO.read(handle, "genbank")
            finally:
                # Close the handle even when parsing fails (was leaked before).
                handle.close()
            # This is simplified — real pipelines align to reference
            result = {
                "accession": acc,
                "length": len(record.seq),
                "country": "Unknown",
                "date": "Unknown",
            }
            # Extract metadata from source-feature qualifiers
            for feature in record.features:
                if feature.type == "source":
                    quals = feature.qualifiers
                    # GenBank replaced /country with /geo_loc_name in newer
                    # records; accept either spelling.
                    result["country"] = quals.get(
                        "geo_loc_name", quals.get("country", ["Unknown"])
                    )[0]
                    result["date"] = quals.get("collection_date", ["Unknown"])[0]
            results.append(result)
        except Exception as e:
            # Surface the failure in the table and keep processing.
            results.append({"accession": acc, "error": str(e)})
    return pd.DataFrame(results)
# Example accessions (real SARS-CoV-2 sequences — replace with actual accession numbers)
example_accessions = ["MN908947"]  # Original Wuhan reference genome
# Fetch metadata for each accession and print the summary table.
df_summary = sars_cov2_surveillance_summary(example_accessions)
print(df_summary)
Resumo: Padrões do Fluxo de Trabalho BLAST
| Tarefa | Ferramenta/Método |
|---|---|
| Identificar sequência desconhecida | BLAST contra NT/NR |
| Comparar cepas | Alinhamento por pares, matriz de identidade |
| Rastrear evolução viral | Buscar sequências por ano/cepa, construir árvore filogenética |
| Detectar mutações de resistência | Comparar com referência, anotar posições |
| Vigilância | Buscar sequências por país/data, caracterizar mutações |
A versão industrial completa deste fluxo de trabalho (usada na vigilância real da COVID) processa bancos de dados na escala do GISAID, usa os pipelines augur/Nextstrain para posicionamento filogenético e publica os resultados em uma visualização filogenética em tempo real em nextstrain.org. A lógica subjacente é exatamente o que abordamos aqui — a escala é diferente, a biologia é a mesma.
Para SARS-CoV-2, o GISAID (gisaid.org) é o repositório principal — mais de 15 milhões de sequências depositadas. O acesso requer registro e concordância com os termos de compartilhamento de dados.
O portal Virus do NCBI (ncbi.nlm.nih.gov/labs/virus) fornece uma interface web e API para sequências do SARS-CoV-2 e outros vírus do GenBank, com filtragem por país, data, hospedeiro e completude do genoma.
Para influenza, o Banco de Dados de Influenza do NCBI (ncbi.nlm.nih.gov/genomes/FLU) fornece sequências de genoma completo com rótulos de segmento padronizados.