[ PROMPT_NODE_27170 ]
common_workflows
[ SKILL_DOCUMENTATION ]
# Pysam 常见生物信息学工作流
## 概述
本文档提供了使用 pysam 进行常见生物信息学工作流的实用示例,演示了如何结合不同文件类型和操作。
## 质量控制工作流
### 计算 BAM 统计信息
python
import pysam
def calculate_bam_stats(bam_file):
"""计算 BAM 文件的基本统计信息。"""
samfile = pysam.AlignmentFile(bam_file, "rb")
stats = {
"total_reads": 0,
"mapped_reads": 0,
"unmapped_reads": 0,
"paired_reads": 0,
"proper_pairs": 0,
"duplicates": 0,
"total_bases": 0,
"mapped_bases": 0
}
for read in samfile.fetch(until_eof=True):
stats["total_reads"] += 1
if read.is_unmapped:
stats["unmapped_reads"] += 1
else:
stats["mapped_reads"] += 1
stats["mapped_bases"] += read.query_alignment_length
if read.is_paired:
stats["paired_reads"] += 1
if read.is_proper_pair:
stats["proper_pairs"] += 1
if read.is_duplicate:
stats["duplicates"] += 1
stats["total_bases"] += read.query_length
samfile.close()
# 计算衍生统计信息
stats["mapping_rate"] = stats["mapped_reads"] / stats["total_reads"] if stats["total_reads"] > 0 else 0
stats["duplication_rate"] = stats["duplicates"] / stats["total_reads"] if stats["total_reads"] > 0 else 0
return stats
### 检查参考序列一致性
python
def check_bam_reference_consistency(bam_file, fasta_file):
"""验证 BAM reads 是否与参考基因组匹配。"""
samfile = pysam.AlignmentFile(bam_file, "rb")
fasta = pysam.FastaFile(fasta_file)
mismatches = 0
total_checked = 0
for read in samfile.fetch():
if read.is_unmapped:
continue
# 获取比对区域的参考序列
ref_seq = fasta.fetch(
read.reference_name,
read.reference_start,
read.reference_end
)
# 获取比对到参考序列的 read 序列
aligned_pairs = read.get_aligned_pairs(with_seq=True)
for query_pos, ref_pos, ref_base in aligned_pairs:
if query_pos is not None and ref_pos is not None and ref_base is not None:
read_base = read.query_sequence[query_pos]
if read_base.upper() != ref_base.upper():
mismatches += 1
to