Biopython is the closest thing biology has to a standard library for sequence analysis. It wraps NCBI databases, parses dozens of file formats, implements classic algorithms, and provides data structures purpose-built for biological sequences. If you're writing Python code that touches sequences, you're almost certainly using Biopython.
This chapter covers the core workflows: working with sequences, parsing common file formats, computing sequence properties, aligning sequences, and a brief introduction to structure handling.
Installation and Setup
pip install biopython
The main namespace is Bio. Submodules are organized by function:
from Bio import SeqIO # File parsing and sequence I/O
from Bio.Seq import Seq # The core Seq object
from Bio import Entrez # NCBI database access
from Bio import Align # Pairwise and multiple alignment
from Bio import pairwise2 # Legacy pairwise alignment
from Bio.PDB import PDBParser # Structure parsing
The Seq Object
The foundational abstraction is Bio.Seq.Seq — a string-like object that knows it represents a biological sequence and supports biological operations.
from Bio.Seq import Seq
# Create a sequence
dna = Seq("ATGCGATCGATCGATCG")
# String-like operations work as expected
print(len(dna)) # 18
print(dna[0:6]) # ATGCGA
print(dna.count("ATG")) # 1
# Biology-specific operations
print(dna.complement()) # TACGCTAGCTAGCTAGC
print(dna.reverse_complement()) # CGATCGATCGATCGCAT
# Transcription: DNA → RNA (T → U)
rna = dna.transcribe()
print(rna) # AUGCGAUCGAUCGAUCG
# Translation: RNA → protein
protein = rna.translate()
print(protein) # MRSS* (where * is stop codon)
# Direct DNA → protein
protein = dna.translate()
print(protein) # MRSS*
print(protein[:-1]) # Remove stop codon: MRSS
Sequence Alphabets and Types
# By default, Seq is typeless — works for DNA, RNA, or protein
# For RNA sequences, use U instead of T
rna = Seq("AUGCGAUCG")
# MutableSeq allows in-place modification
from Bio.Seq import MutableSeq
mseq = MutableSeq("ATGCGATCG")
mseq[0] = "G" # point substitution
print(mseq) # GTGCGATCG
SeqRecord: Sequences with Metadata
The SeqRecord object combines a Seq with its identifier, name, description, and features.
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
record = SeqRecord(
Seq("ATGCGATCGATCG"),
id="NM_007294",
name="BRCA1",
description="Homo sapiens BRCA1 mRNA",
annotations={"molecule_type": "mRNA"}
)
print(record.id) # NM_007294
print(record.seq) # ATGCGATCGATCG
print(record.description) # Homo sapiens BRCA1 mRNA
File Parsing with SeqIO
Bio.SeqIO reads and writes all common biological sequence formats. The interface is consistent regardless of format:
from Bio import SeqIO
# Parse a FASTA file
records = list(SeqIO.parse("sequences.fasta", "fasta"))
for record in records:
print(f"{record.id}: {len(record.seq)} bp")
# Parse a GenBank file
for record in SeqIO.parse("brca1.gb", "genbank"):
print(record.id)
print(record.description)
print(f"Features: {len(record.features)}")
# Read a single record
record = SeqIO.read("single.fasta", "fasta") # raises ValueError if >1 record
Supported Formats
Common formats and their format strings:
| Format | String | Notes |
|---|---|---|
| FASTA | "fasta" | Most common; no annotations |
| GenBank | "genbank" or "gb" | Full annotations, features |
| FASTQ | "fastq" | Sequences + quality scores |
| EMBL | "embl" | European equivalent of GenBank |
| Stockholm | "stockholm" | Multiple sequence alignments |
| Clustal | "clustal" | Multiple sequence alignments |
Writing Sequences
# Write to FASTA
SeqIO.write(records, "output.fasta", "fasta")
# Convert between formats
with open("output.gb", "w") as f:
SeqIO.write(records, f, "genbank")
# Convert FASTQ → FASTA
records = SeqIO.parse("reads.fastq", "fastq")
SeqIO.write(records, "reads.fasta", "fasta")
Working with GenBank Features
GenBank records contain rich feature annotations. Features are SeqFeature objects with a type, location, and qualifiers:
record = SeqIO.read("brca1.gb", "genbank")
for feature in record.features:
print(f"Type: {feature.type}")
print(f"Location: {feature.location}")
print(f"Strand: {feature.location.strand}") # 1 (forward) or -1 (reverse)
# Filter for CDS features
cds_features = [f for f in record.features if f.type == "CDS"]
for cds in cds_features:
gene = cds.qualifiers.get("gene", ["unknown"])[0]
protein_id = cds.qualifiers.get("protein_id", ["?"])[0]
# Extract the CDS sequence (handles joins and strand)
cds_seq = cds.extract(record.seq)
protein = cds_seq.translate(to_stop=True)
print(f"{gene} ({protein_id}): {len(protein)} aa")
Feature Locations
Feature locations can be:
SimpleLocation(start, end, strand)— a simple intervalCompoundLocation— a join of intervals (for genes with introns, or wrapping around circular chromosomes)
from Bio.SeqFeature import SimpleLocation
feature = record.features[0]
loc = feature.location
print(loc.start) # 0-based start (ExactPosition)
print(loc.end) # 0-based end (ExactPosition)
print(loc.strand) # 1 or -1
# Check if compound (spliced)
from Bio.SeqFeature import CompoundLocation
if isinstance(loc, CompoundLocation):
for part in loc.parts:
print(f" Exon: {part.start}–{part.end}")
Sequence Analysis
GC Content and Composition
from Bio.SeqUtils import gc_fraction
seq = Seq("ATGCGATCGATCGATCG")
gc = gc_fraction(seq)
print(f"GC content: {gc:.1%}") # 47.1%
# Manual composition
from collections import Counter
counts = Counter(str(seq))
print(counts) # Counter({'A': 3, 'T': 4, 'G': 5, 'C': 5, ...})
Codon Usage
from Bio.SeqUtils import CodonAdaptationIndex
from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex
# Count codons in a coding sequence
cds = Seq("ATGAAACGTTTTGGG") # must be in-frame, no partial codons
codons = [str(cds[i:i+3]) for i in range(0, len(cds)-3, 3)]
from collections import Counter
codon_counts = Counter(codons)
print(codon_counts)
Finding ORFs
def find_orfs(seq: Seq, min_length: int = 100) -> list:
orfs = []
for strand, nuc in [(+1, seq), (-1, seq.reverse_complement())]:
for frame in range(3):
i = frame
while i < len(nuc) - 2:
codon = nuc[i:i+3]
if codon == "ATG":
# Find the stop codon
for j in range(i, len(nuc) - 2, 3):
if nuc[j:j+3] in ("TAA", "TAG", "TGA"):
orf_len = j + 3 - i
if orf_len >= min_length:
orfs.append({
"start": i,
"end": j + 3,
"strand": strand,
"frame": frame,
"length": orf_len,
"seq": nuc[i:j+3]
})
break
i += 3
return sorted(orfs, key=lambda x: -x["length"])
orfs = find_orfs(Seq("ATGAAACGTTTTGGGTAG"))
for orf in orfs:
print(f"Strand {orf['strand']}, frame {orf['frame']}: {orf['length']} bp")
Sequence Alignment
Pairwise Alignment
Biopython's Bio.Align provides pairwise alignment with configurable scoring:
from Bio import Align
aligner = Align.PairwiseAligner()
aligner.mode = "global" # Needleman-Wunsch
# aligner.mode = "local" # Smith-Waterman
# Default scoring: match=1, mismatch=0, gap=-0.5
aligner.match_score = 2
aligner.mismatch_score = -1
aligner.open_gap_score = -2
aligner.extend_gap_score = -0.5
seq1 = Seq("ATGCGATCG")
seq2 = Seq("ATGCAATCG")
alignments = aligner.align(seq1, seq2)
best = alignments[0]
print(f"Score: {best.score}")
print(best)
BLAST Queries via NCBI
from Bio.Blast import NCBIWWW, NCBIXML
import time
# Run BLAST (qblast = query BLAST via NCBI web service)
result_handle = NCBIWWW.qblast(
"blastn", # program: blastn, blastp, blastx, tblastn, tblastx
"nt", # database: nt, nr, refseq_rna, etc.
"ATGCGATCGATCG", # query sequence
hitlist_size=10
)
# Parse results
blast_records = NCBIXML.parse(result_handle)
blast_record = next(blast_records)
for alignment in blast_record.alignments:
for hsp in alignment.hsps:
print(f"Match: {alignment.title[:60]}")
print(f"Score: {hsp.score}, E-value: {hsp.expect:.2e}")
print(f"Identity: {hsp.identities}/{hsp.align_length}")
break # just first HSP
NCBI's BLAST web service rate-limits heavy use. For production workflows with many sequences, install BLAST+ locally (blast+ package) and run blastn / blastp as command-line tools. Biopython can parse the XML output with NCBIXML:
from Bio.Blast import NCBIXML
with open("blast_output.xml") as f:
records = list(NCBIXML.parse(f))
Working with Multiple Sequence Alignments
After aligning multiple sequences (with ClustalW, MUSCLE, MAFFT, etc.), Biopython can parse and analyze the result:
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
# Parse a Clustal alignment file
alignment = AlignIO.read("sequences.aln", "clustal")
print(f"Alignment length: {alignment.get_alignment_length()}")
print(f"Number of sequences: {len(alignment)}")
for record in alignment:
print(f"{record.id}: {record.seq}")
# Get a column from the alignment
col_10 = alignment[:, 10] # column 10, all sequences
print(f"Column 10: {col_10}")
# Compute conservation per position
def column_identity(alignment, position: int) -> float:
col = alignment[:, position]
most_common = Counter(col).most_common(1)[0][1]
return most_common / len(col)
for i in range(alignment.get_alignment_length()):
identity = column_identity(alignment, i)
if identity < 0.8:
print(f"Position {i}: {identity:.0%} conserved")
Structure Basics with Bio.PDB
For 3D protein structures, Biopython provides a hierarchy: Structure → Model → Chain → Residue → Atom.
from Bio.PDB import PDBParser, MMCIFParser, PDBIO
import urllib.request
# Download and parse
urllib.request.urlretrieve("https://files.rcsb.org/download/1JM7.pdb", "1JM7.pdb")
parser = PDBParser(QUIET=True)
structure = parser.get_structure("1JM7", "1JM7.pdb")
# Navigate the hierarchy
model = structure[0] # first MODEL
chain = model["A"] # chain A
residue = chain[100] # residue 100
atom = residue["CA"] # alpha carbon
print(f"CA coordinates: {atom.get_vector()}")
# All residues in chain A
residues = [r for r in chain.get_residues() if r.get_id()[0] == " "]
print(f"Standard residues in chain A: {len(residues)}")
Computing Distances
def get_ca_distance(res1, res2) -> float:
ca1 = res1["CA"].get_vector()
ca2 = res2["CA"].get_vector()
return (ca1 - ca2).norm()
# Distance between residues 10 and 50
res10 = chain[10]
res50 = chain[50]
dist = get_ca_distance(res10, res50)
print(f"Cα distance: {dist:.2f} Å")
Practical Pipeline: Sequence Analysis Workflow
Here's a complete workflow connecting the pieces:
from Bio import Entrez, SeqIO
from Bio.SeqUtils import gc_fraction
from Bio.Seq import Seq
import time
Entrez.email = "your@email.com"
def analyze_gene(gene_name: str, organism: str = "Homo sapiens") -> dict:
# Fetch from NCBI
handle = Entrez.esearch(
db="nucleotide",
term=f"{gene_name}[Gene Name] AND {organism}[Organism] AND RefSeq[Filter] AND mRNA[Filter]",
retmax=1
)
result = Entrez.read(handle)
handle.close()
if not result["IdList"]:
return {"error": "Gene not found"}
time.sleep(0.4)
handle = Entrez.efetch(db="nucleotide", id=result["IdList"][0], rettype="gb", retmode="text")
record = SeqIO.read(handle, "genbank")
handle.close()
# Find CDS
cds_features = [f for f in record.features if f.type == "CDS"]
analysis = {
"accession": record.id,
"total_length_bp": len(record.seq),
"gc_content": gc_fraction(record.seq),
"cds_count": len(cds_features),
}
if cds_features:
cds_seq = cds_features[0].extract(record.seq)
protein = cds_seq.translate(to_stop=True)
analysis["protein_length_aa"] = len(protein)
analysis["protein_gc"] = gc_fraction(cds_seq)
return analysis
result = analyze_gene("BRCA1")
for key, value in result.items():
print(f"{key}: {value}")
Summary
Biopython's key modules:
| Module | Use |
|---|---|
Bio.Seq | Core sequence object with biological operations |
Bio.SeqIO | Read/write all sequence file formats |
Bio.SeqRecord | Sequences with associated metadata |
Bio.Entrez | Programmatic NCBI database access |
Bio.Align | Pairwise and multiple sequence alignment |
Bio.Blast | BLAST queries and result parsing |
Bio.PDB | 3D structure parsing and analysis |
For almost any biological sequence analysis task in Python, start with Biopython. If you find it doesn't cover your use case, look for dedicated libraries: pyvcf/cyvcf2 for VCF files, pysam for SAM/BAM files, scanpy for single-cell analysis, BioPandas for structure-as-dataframe workflows.