Biological networks — protein-protein interactions, gene regulatory relationships, metabolic pathways — are graphs, and graph analysis tools are directly applicable. In this chapter we'll use the STRING database (protein interaction network) with NetworkX (Python graph library) to analyze real biological networks: finding hub proteins, detecting communities, and connecting network structure to biological function.
Setup
pip install networkx matplotlib pandas requests
import json
from collections import Counter
from typing import Optional

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import requests
The STRING Database
STRING (Search Tool for the Retrieval of Interacting Genes/Proteins) is the most comprehensive protein-protein interaction database. It integrates:
- Experimentally validated interactions (co-immunoprecipitation, yeast two-hybrid)
- Computationally predicted interactions (gene co-occurrence, co-expression)
- Literature-mined interactions (text mining)
- Pathway databases (KEGG, Reactome)
Each interaction has a combined score (0–1000) reflecting evidence strength. Filtering at score ≥ 400 (medium confidence) or ≥ 700 (high confidence) is standard practice.
Fetching a Network from STRING API
def fetch_string_network(
    proteins: list[str],
    species: int = 9606,  # NCBI taxonomy ID; 9606 = Homo sapiens
    score_threshold: int = 400,
) -> pd.DataFrame:
    """Fetch protein-protein interactions from the STRING API.

    Parameters
    ----------
    proteins : gene/protein identifiers understood by STRING.
    species : NCBI taxonomy ID (default: human).
    score_threshold : minimum combined score (0-1000) an interaction must meet.

    Returns a DataFrame with columns preferredName_A, preferredName_B, score,
    or an empty DataFrame when STRING reports no interactions.

    Raises requests.HTTPError on a non-2xx response.
    """
    url = "https://string-db.org/api/json/network"
    params = {
        # STRING expects multiple identifiers separated by carriage returns.
        # BUG FIX: join with the raw "\r" character and let `requests`
        # URL-encode it to %0D. Embedding the literal string "%0d" gets
        # double-encoded to "%250d", so only the first protein was
        # effectively queried.
        "identifiers": "\r".join(proteins),
        "species": species,
        "required_score": score_threshold,
        "caller_identity": "bio-for-devs-tutorial",
    }
    # Timeout so a stalled API call cannot hang the pipeline indefinitely
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()
    if not data:
        return pd.DataFrame()
    df = pd.DataFrame(data)
    return df[["preferredName_A", "preferredName_B", "score"]]
# Fetch the TP53 interaction neighborhood
tp53_neighbors_url = "https://string-db.org/api/json/interaction_partners"
params = {
    "identifiers": "TP53",
    "species": 9606,
    "limit": 50,  # top 50 interactors
    "required_score": 700,
    "caller_identity": "bio-for-devs-tutorial"
}
response = requests.get(tp53_neighbors_url, params=params, timeout=30)
# BUG FIX: check the HTTP status before parsing. Without this, an API error
# page surfaces later as a confusing JSONDecodeError instead of a clear
# HTTPError at the request site.
response.raise_for_status()
partners = response.json()
# Extract the interactor names
interactors = [p["preferredName_B"] for p in partners]
print(f"Top TP53 interactors: {interactors[:10]}")
# Now fetch the network among TP53 and its partners
all_proteins = ["TP53"] + interactors[:20]  # limit for this tutorial
interactions = fetch_string_network(all_proteins, score_threshold=700)
print(f"Fetched {len(interactions)} interactions")
interactions.head()
Building a NetworkX Graph
def build_graph(interactions: pd.DataFrame) -> nx.Graph:
    """Build an undirected graph from a STRING interaction table.

    Each row of *interactions* becomes one edge; the STRING combined score
    is stored on the edge as ``weight``, divided by 1000 to map the 0-1000
    scale onto 0-1.

    NOTE(review): the STRING JSON API may already return ``score`` on a
    0-1 scale (the 0-1000 integers appear in the TSV downloads) — confirm
    against a sample response before trusting this normalization.
    """
    graph = nx.Graph()
    # itertuples avoids the per-row Series construction cost of iterrows
    for record in interactions.itertuples(index=False):
        graph.add_edge(
            record.preferredName_A,
            record.preferredName_B,
            weight=record.score / 1000,
        )
    return graph
# Assemble the graph and report its headline statistics
G = build_graph(interactions)
n_nodes = G.number_of_nodes()
print(f"Nodes: {n_nodes}")
print(f"Edges: {G.number_of_edges()}")
mean_degree = sum(deg for _, deg in G.degree()) / n_nodes
print(f"Average degree: {mean_degree:.1f}")
print(f"Is connected: {nx.is_connected(G)}")
Basic Network Statistics
Degree Distribution
# Tally how many nodes have each degree and render a text histogram
degrees = dict(G.degree())
degree_values = list(degrees.values())
degree_counts = Counter(degree_values)
print("Degree distribution:")
for deg in sorted(degree_counts):
    count = degree_counts[deg]
    print(f" Degree {deg:2d}: {'█' * count} ({count})")
# Top hubs: the most-connected proteins
top_hubs = sorted(degrees.items(), key=lambda item: item[1], reverse=True)[:10]
print("\nTop hub proteins:")
for protein, deg in top_hubs:
    print(f" {protein}: {deg} connections")
Centrality Measures
# Degree centrality: fraction of nodes connected to
degree_centrality = nx.degree_centrality(G)
# Betweenness centrality: fraction of shortest paths passing through a node.
# BUG FIX: NetworkX interprets the `weight` attribute as a DISTANCE (cost),
# but our weights are confidence scores where larger = stronger. Passing
# weight="weight" made high-confidence edges look *long*, inverting the
# result. Convert confidence to a distance (1 - confidence, floored just
# above zero so it stays a valid positive path cost) first.
for u, v, attrs in G.edges(data=True):
    attrs["distance"] = max(1.0 - attrs.get("weight", 0.5), 1e-6)
betweenness = nx.betweenness_centrality(G, weight="distance")
# Closeness centrality: how quickly a node can reach all others
closeness = nx.closeness_centrality(G)
# PageRank: importance based on neighbors' importance (like Google PageRank).
# PageRank treats weight as edge *strength*, so the raw confidence is correct.
pagerank = nx.pagerank(G, weight="weight")
# Combine into a DataFrame, ranked with bottleneck proteins first
proteins = list(degree_centrality)
centrality_df = pd.DataFrame({
    "protein": proteins,
    "degree_centrality": [degree_centrality[p] for p in proteins],
    "betweenness": [betweenness[p] for p in proteins],
    "closeness": [closeness[p] for p in proteins],
    "pagerank": [pagerank[p] for p in proteins],
}).sort_values("betweenness", ascending=False)
print(centrality_df.head(10).to_string(index=False))
Degree centrality identifies proteins with many direct interactions (physical hubs). Betweenness centrality identifies proteins that bridge different parts of the network (bottlenecks).
In biology, high-betweenness proteins are often essential — removing them disconnects functional modules. Many drug targets and disease genes are betweenness hubs: they sit at junctions between pathways, making them both important and potentially druggable without disrupting all pathways at once.
Community Detection
Proteins cluster into functional modules — groups that interact more among themselves than with the rest of the network. These often correspond to protein complexes, pathways, or co-regulated gene programs.
# Louvain community detection (requires python-louvain)
# pip install python-louvain
try:
    import community as community_louvain
    partition = community_louvain.best_partition(G, weight="weight")
    modularity = community_louvain.modularity(partition, G, weight="weight")
    print(f"Modularity: {modularity:.3f}")
    # Organize by community
    from collections import defaultdict
    communities = defaultdict(list)
    for protein, comm_id in partition.items():
        communities[comm_id].append(protein)
    print("\nCommunities:")
    for comm_id, members in sorted(communities.items()):
        print(f" Community {comm_id}: {', '.join(sorted(members))}")
    # BUG FIX: the visualization and enrichment code below consume
    # `communities_raw`, which was previously defined only in the
    # ImportError fallback — with python-louvain installed the script
    # crashed with NameError. Define it in this branch too.
    communities_raw = [set(members) for _, members in sorted(communities.items())]
except ImportError:
    # Fallback: NetworkX's built-in greedy modularity maximization
    # (the original comment said "Girvan-Newman", which is a different
    # algorithm — this is CNM-style agglomerative modularity optimization)
    from networkx.algorithms.community import greedy_modularity_communities
    communities_raw = list(greedy_modularity_communities(G, weight="weight"))
    print(f"Found {len(communities_raw)} communities")
    for i, comm in enumerate(communities_raw):
        print(f" Community {i}: {sorted(comm)}")
Shortest Paths and Network Distance
# Shortest path between two proteins
source = "TP53"
target = "BRCA1"
# BUG FIX: nx.has_path raises NodeNotFound when either endpoint is missing
# from the graph (BRCA1 may not appear at a strict score cutoff), so check
# membership before asking for a path.
if source in G and target in G and nx.has_path(G, source, target):
    # weight=None → hop count, not confidence-weighted distance
    path = nx.shortest_path(G, source, target, weight=None)
    print(f"Shortest path from {source} to {target}:")
    print(f" {' → '.join(path)} (length: {len(path)-1})")
else:
    print(f"No path between {source} and {target}")
# Average shortest path length (only defined for connected graphs)
if nx.is_connected(G):
    avg_path_length = nx.average_shortest_path_length(G)
    print(f"Average path length: {avg_path_length:.2f}")
    diameter = nx.diameter(G)
    print(f"Diameter (longest shortest path): {diameter}")
Visualizing the Network
def visualize_network(
    G: nx.Graph,
    communities: Optional[dict] = None,  # FIX: was annotated plain `dict` with a None default
    title: str = "Protein Interaction Network",
):
    """Draw the network: node size ∝ degree, edge width ∝ confidence.

    Parameters
    ----------
    G : graph whose edges may carry a 0-1 ``weight`` attribute.
    communities : optional mapping of node -> community id; when given,
        nodes are colored by community (ids wrapped modulo the 20-color map).
    title : figure title.

    Side effects: writes ``protein_network.png`` and shows the figure.
    """
    fig, ax = plt.subplots(figsize=(14, 10))
    # Fixed seed → deterministic layout, so reruns produce the same picture
    pos = nx.spring_layout(G, seed=42, k=2)
    # Node colors by community (if provided)
    if communities:
        color_map = plt.cm.tab20
        node_colors = [color_map(communities.get(node, 0) % 20) for node in G.nodes()]
    else:
        node_colors = "steelblue"
    # Node sizes by degree (hubs drawn larger)
    degrees = dict(G.degree())
    node_sizes = [300 + 200 * degrees[node] for node in G.nodes()]
    # Edge widths by weight; 0.5 is the fallback for unweighted edges
    edge_weights = [G[u][v].get("weight", 0.5) for u, v in G.edges()]
    edge_widths = [w * 3 for w in edge_weights]
    # Draw edges first so nodes and labels sit on top
    nx.draw_networkx_edges(G, pos, width=edge_widths, alpha=0.4, edge_color="gray", ax=ax)
    nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=node_sizes, alpha=0.9, ax=ax)
    nx.draw_networkx_labels(G, pos, font_size=8, font_weight="bold", ax=ax)
    ax.set_title(title, fontsize=14, pad=20)
    ax.axis("off")
    plt.tight_layout()
    plt.savefig("protein_network.png", dpi=150, bbox_inches="tight")
    plt.show()
# Visualize with community coloring
# Invert the list-of-sets into a node -> community-id mapping, then draw
# the graph with community coloring.
community_partition = {
    protein: comm_idx
    for comm_idx, members in enumerate(communities_raw)
    for protein in members
}
visualize_network(G, communities=community_partition)
Enrichment Analysis on Communities
Once you've identified network communities, you want to know what biological processes they represent:
def enrichment_via_string_api(proteins: list[str], species: int = 9606) -> pd.DataFrame:
    """Get GO/pathway enrichment for a list of proteins from STRING.

    Returns up to 20 enriched terms sorted by FDR (best first), restricted
    to the columns available in the response, or an empty DataFrame when
    STRING returns no enrichment.

    Raises requests.HTTPError on a non-2xx response.
    """
    url = "https://string-db.org/api/json/enrichment"
    params = {
        # BUG FIX: join identifiers with the raw "\r" character; `requests`
        # URL-encodes it to %0D. The literal "%0d" was double-encoded to
        # "%250d", so only the first protein reached the API.
        "identifiers": "\r".join(proteins),
        "species": species,
        "caller_identity": "bio-for-devs-tutorial",
    }
    # Timeout so a stalled API call cannot hang the pipeline indefinitely
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()
    if not data:
        return pd.DataFrame()
    df = pd.DataFrame(data)
    relevant_cols = ["category", "term", "description", "fdr", "number_of_genes", "inputGenes"]
    available_cols = [c for c in relevant_cols if c in df.columns]
    return df[available_cols].sort_values("fdr").head(20)
# Analyze the largest community
top_community = max(communities_raw, key=len)
print(f"\nAnalyzing community with {len(top_community)} proteins: {sorted(top_community)}")
enrichment = enrichment_via_string_api(list(top_community))
# BUG FIX: an empty result DataFrame has no columns, so selecting
# ["category", "description", "fdr"] would raise KeyError — guard first.
if enrichment.empty:
    print("No enrichment terms returned")
else:
    print(enrichment[["category", "description", "fdr"]].to_string(index=False))
Building a Differential Network
A powerful application: build two networks (e.g., tumor vs. normal) and find interactions that appear/disappear.
def compare_networks(G1: nx.Graph, G2: nx.Graph, label1: str = "Network 1", label2: str = "Network 2"):
    """Report edge overlap between two graphs and the most-changed nodes.

    Edges are compared as unordered pairs (frozensets), so node order
    within an edge doesn't matter. Prints counts of exclusive/shared
    edges, the Jaccard similarity of the edge sets, and the ten nodes
    with the most gained/lost connections.
    """
    edges1 = set(frozenset(e) for e in G1.edges())
    edges2 = set(frozenset(e) for e in G2.edges())
    only_in_1 = edges1 - edges2
    only_in_2 = edges2 - edges1
    in_both = edges1 & edges2
    union = edges1 | edges2
    print(f"Edges only in {label1}: {len(only_in_1)}")
    print(f"Edges only in {label2}: {len(only_in_2)}")
    print(f"Shared edges: {len(in_both)}")
    # BUG FIX: two edgeless graphs made the Jaccard denominator zero;
    # identical (empty) edge sets count as perfectly similar.
    jaccard = len(in_both) / len(union) if union else 1.0
    print(f"Jaccard similarity: {jaccard:.3f}")
    # Nodes that gained/lost connections
    diff_edges = only_in_1 | only_in_2
    diff_nodes = Counter()
    for edge in diff_edges:
        for node in edge:
            diff_nodes[node] += 1
    print(f"\nMost changed nodes:")
    for node, n_changes in diff_nodes.most_common(10):
        print(f" {node}: {n_changes} changed connections")
# Example: the same protein set at two confidence cutoffs yields two
# different networks — compare them edge-by-edge.
strict_interactions = fetch_string_network(all_proteins, score_threshold=900)
lenient_interactions = fetch_string_network(all_proteins, score_threshold=400)
compare_networks(
    build_graph(strict_interactions),
    build_graph(lenient_interactions),
    "High confidence",
    "Medium confidence",
)
Complete Pipeline: From Gene List to Network Insight
def analyze_gene_network(
    gene_list: list[str],
    species: int = 9606,
    score_threshold: int = 700,
) -> dict:
    """Full pipeline: gene list → network → hubs → communities → enrichment.

    Returns a summary dict with node/edge counts, density, the top-5
    betweenness hubs, community statistics, and up to five enriched terms
    for the largest community — or {"error": ...} when STRING has no
    interactions for the input genes.
    """
    print(f"Fetching interactions for {len(gene_list)} genes...")
    interactions = fetch_string_network(gene_list, species, score_threshold)
    if interactions.empty:
        return {"error": "No interactions found"}
    G = build_graph(interactions)
    print(f"Network: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
    # Hubs: rank by betweenness (bridging / bottleneck proteins)
    betweenness = nx.betweenness_centrality(G)
    ranked = sorted(betweenness.items(), key=lambda kv: kv[1], reverse=True)
    top_hubs = ranked[:5]
    # Functional modules via greedy modularity maximization
    from networkx.algorithms.community import greedy_modularity_communities
    communities = list(greedy_modularity_communities(G, weight="weight"))
    # Annotate the largest module with GO/pathway terms (capped at 20 genes)
    largest_comm = max(communities, key=len)
    enrichment = enrichment_via_string_api(list(largest_comm)[:20])
    top_terms = [] if enrichment.empty else enrichment["description"].head(5).tolist()
    return {
        "n_nodes": G.number_of_nodes(),
        "n_edges": G.number_of_edges(),
        "density": nx.density(G),
        "top_hubs": [hub for hub, _ in top_hubs],
        "n_communities": len(communities),
        "largest_community_size": len(largest_comm),
        "top_enriched_terms": top_terms,
    }
# Demo: run the full pipeline on a DNA damage response gene set
ddr_genes = [
    "TP53", "BRCA1", "BRCA2", "ATM", "ATR", "CHEK1", "CHEK2",
    "MDM2", "CDK2", "CDKN1A", "RB1", "E2F1", "PCNA", "RAD51",
]
result = analyze_gene_network(ddr_genes)
for key, value in result.items():
    print(f"{key}: {value}")
Summary
Key concepts covered:
| Tool | Use |
|---|---|
| STRING API | Fetch protein-protein interactions with evidence scores |
| `nx.Graph` | Build and manipulate biological networks |
| Centrality measures | Identify hub proteins and regulatory bottlenecks |
| Community detection | Find functional modules in the network |
| Enrichment analysis | Interpret communities with biological annotations |
The workflow — fetch interactions → build graph → compute centralities → detect communities → enrich with pathway annotations — is a standard bioinformatics pattern applicable to any type of biological network (TF-gene, metabolic, coexpression).
The power of network analysis is that it reveals relationships that individual gene lists miss. Two genes can both be differentially expressed without interacting; or one hub gene can drive a change in 50 downstream genes, which is invisible when you look at each gene independently.