Raw sequencing data processing

conda create -n myenv python=3.7
conda activate myenv
conda config --add channels bioconda
conda config --add channels conda-forge
conda install multiqc

conda activate sgRNA
# Extract reads that contain the P5 primer and the gRNA scaffold sequence
for sample in {01..04}
do
    cutadapt -g ^ttgtggaaaggacgaaacaccg...gttttagagctagaaatagcaagtt -o A${sample}_R1_sgRNAs.fastq.gz A${sample}_R1_001.fastq.gz
    cutadapt -g ^ttgtggaaaggacgaaacaccg...gttttagagctagaaatagcaagtt -o A${sample}_R2_sgRNAs.fastq.gz A${sample}_R2_001.fastq.gz
done

# Spot-check: count occurrences of one expected sgRNA/scaffold junction
zgrep -o 'TAATATACCTAGCCAGGGTTTTAGAGCT' A01_R1_sgRNAs.fastq.gz | wc -l

~~After downloading TruSeq3-PE.fa:
for sample in {01..04}
do
    trimmomatic PE -phred33 \
    "A${sample}_R1_001.fastq.gz" "A${sample}_R2_001.fastq.gz" \
    "A${sample}_R1_paired.fastq.gz" "A${sample}_R1_unpaired.fastq.gz" \
    "A${sample}_R2_paired.fastq.gz" "A${sample}_R2_unpaired.fastq.gz" \
    ILLUMINACLIP:TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
    fastqc "A${sample}_R1_paired.fastq.gz" "A${sample}_R2_paired.fastq.gz"
done~~

multiqc .

python3 txt_fasta.py  # convert the plain-text sgRNA list to FASTA
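txt_fasta.py itself is not reproduced here. A minimal sketch of what it presumably does, assuming the input is one sgRNA sequence per line and that records are simply numbered (both assumptions):

import sys

# Sketch of txt_fasta.py (assumed behavior): convert a plain-text list of
# sgRNA sequences, one per line, into FASTA records. The sgRNA_<n> naming
# scheme is an assumption for illustration.
def txt_to_fasta(txt_path, fasta_path):
    with open(txt_path) as src, open(fasta_path, 'w') as dst:
        for i, line in enumerate(src, start=1):
            seq = line.strip()
            if seq:  # skip blank lines
                dst.write(f">sgRNA_{i}\n{seq}\n")

if __name__ == "__main__":
    # e.g. python3 txt_fasta.py sgRNA_list.txt target_sequences.fa
    txt_to_fasta(sys.argv[1], sys.argv[2])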

conda activate sgRNA
cd $wd
# Map the reads to the sgRNA references
# (seqtk reads gzipped FASTQ directly, so no separate zcat step is needed)
seqtk seq -a A01_R1_001.fastq.gz > A01_R1_001.fasta
seqtk seq -a A01_R2_001.fastq.gz > A01_R2_001.fasta

# Build the index from the target sgRNA sequences and align the reads against
# it, so that the SAM reference names are the sgRNA identifiers counted by the
# scripts below; --all reports every alignment of a multi-mapping read
bowtie2-build target_sequences.fa sgRNA_index
bowtie2 -x sgRNA_index -f A01_R1_001.fasta -S A01_R1_001_alignment.sam --all
bowtie2 -x sgRNA_index -f A01_R2_001.fasta -S A01_R2_001_alignment.sam --all

The SAM file needs some cleanup:
# Extract the header lines from the SAM file and drop duplicate entries
grep '^@' A01_R1_001_alignment.sam | sort | uniq > header_no_duplicates.sam
# Extract the alignment records
grep -v '^@' A01_R1_001_alignment.sam > body.sam
cat header_no_duplicates.sam body.sam > fixed_A01_R1_001_alignment.sam
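As a quick sanity check that the repaired file now parses, a minimal sketch using the same pysam calls as the analysis scripts below:

import pysam

# Confirm the fixed SAM is readable and report how many records are mapped.
mapped = total = 0
with pysam.AlignmentFile('fixed_A01_R1_001_alignment.sam', 'r') as sam:
    for read in sam:
        total += 1
        mapped += not read.is_unmapped
print(f"{mapped}/{total} records mapped")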

# Preliminary statistics
python tz_claude.py

pandas is used for data processing, matplotlib and seaborn for visualization, scipy for the Shannon diversity index, and fpdf for the PDF report.

graph TD
  RAW_DATA(SEQ) --> SAM --> PYTHON

Required libraries:

pip install pysam pandas matplotlib seaborn scipy fpdf

Adjust the Python script to the scientific question at hand:

import pysam
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import entropy
from fpdf import FPDF
import numpy as np
from collections import defaultdict

# Read a SAM file and count mapped reads per sgRNA
def read_sam(sam_file):
    counts = defaultdict(int)
    with pysam.AlignmentFile(sam_file, "r") as sam:
        for read in sam:
            if not read.is_unmapped:
                counts[read.reference_name] += 1
    return pd.DataFrame(list(counts.items()), columns=['sgRNA', 'count'])

# Read the original sgRNA list (one identifier per line; the identifiers must
# match the SAM reference names, i.e. the FASTA headers of target_sequences.fa)
def read_sgrna_list(file_path):
    with open(file_path, 'r') as f:
        sgrnas = [line.strip() for line in f if line.strip()]
    return pd.DataFrame({'sgRNA': sgrnas, 'count': [0] * len(sgrnas)})

# 1. Visualization
def plot_distribution(data, title, filename):
    plt.figure(figsize=(10, 6))
    sns.histplot(data['count'], kde=True)
    plt.title(title)
    plt.xlabel('Read Count')
    plt.ylabel('Frequency')
    plt.savefig(filename)
    plt.close()

def plot_comparison(original, amplified, filename):
    merged = original.merge(amplified, on='sgRNA', how='outer', suffixes=('_original', '_amplified'))
    merged = merged.fillna(0)
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='count_original', y='count_amplified', data=merged)
    plt.title('Original vs Amplified Library Comparison')
    plt.xlabel('Original sgRNA Count')
    plt.ylabel('Amplified Read Count')
    plt.savefig(filename)
    plt.close()

# 2. Quality control
def calculate_shannon_index(data):
    probabilities = data['count'] / data['count'].sum()
    return entropy(probabilities)

def analyze_uniformity(data):
    cv = data['count'].std() / data['count'].mean()
    return cv

# 3. Report generation
class PDF(FPDF):
    def header(self):
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'sgRNA Library Analysis Report', 0, 1, 'C')
    
    def footer(self):
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')

def generate_report(original, amplified, shannon_index, cv):
    pdf = PDF()
    pdf.add_page()
    
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, '1. Library Coverage', 0, 1)
    pdf.set_font('Arial', '', 12)
    
    total_original = len(original)
    total_amplified = len(amplified)
    common_sgRNAs = len(set(original['sgRNA']) & set(amplified['sgRNA']))
    
    pdf.multi_cell(0, 10, f"Total sgRNAs in original library: {total_original}")
    pdf.multi_cell(0, 10, f"Total sgRNAs in amplified library: {total_amplified}")
    pdf.multi_cell(0, 10, f"sgRNAs present in both libraries: {common_sgRNAs}")
    pdf.multi_cell(0, 10, f"Percentage of original library in amplified: {common_sgRNAs/total_original*100:.2f}%")
    
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, '2. Library Complexity', 0, 1)
    pdf.set_font('Arial', '', 12)
    pdf.multi_cell(0, 10, f"Shannon Diversity Index: {shannon_index:.4f}")
    pdf.multi_cell(0, 10, f"Coefficient of Variation: {cv:.4f}")
    
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, '3. Visualization', 0, 1)
    pdf.image('amplified_distribution.png', x=10, y=30, w=190)
    
    pdf.add_page()
    pdf.image('library_comparison.png', x=10, y=30, w=190)
    
    pdf.output('sgRNA_library_analysis_report.pdf', 'F')

# Main analysis
def main():
    # Load data
    original = read_sgrna_list('sgRNA_reference_list.txt')
    amplified = read_sam('fixed_A01_R1_001_alignment.sam')
    
    # 1. Visualization
    plot_distribution(amplified, 'Amplified Library Read Count Distribution', 'amplified_distribution.png')
    plot_comparison(original, amplified, 'library_comparison.png')
    
    # 2. Quality Control
    shannon_index = calculate_shannon_index(amplified)
    cv = analyze_uniformity(amplified)
    
    # 3. Report Generation
    generate_report(original, amplified, shannon_index, cv)
    
    print("Analysis complete. Report generated as 'sgRNA_library_analysis_report.pdf'")

if __name__ == "__main__":
    main()

This script does the following:

  1. Reads the original library list and the amplified library data.

  2. Creates an sgRNA coverage distribution plot and a scatter plot comparing the original and amplified libraries.

  3. Computes the Shannon diversity index and the coefficient of variation (CV) to assess library complexity and uniformity (a numeric check follows this list).

  4. Generates a detailed PDF report containing all analysis results and figures.
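As a concrete reference for step 3, a toy numeric check of the two metrics (the count values here are made up for illustration):

import numpy as np
from scipy.stats import entropy

counts = np.array([120.0, 95.0, 110.0, 4.0, 0.0])  # hypothetical per-sgRNA read counts
p = counts / counts.sum()

# Shannon diversity index: H = -sum(p_i * ln(p_i)). scipy's entropy() uses the
# natural log by default and skips zero-probability entries, so it matches the
# manual sum below.
h_manual = -np.sum(p[p > 0] * np.log(p[p > 0]))
assert np.isclose(entropy(p), h_manual)

# Coefficient of variation: CV = std / mean; a lower CV means a more uniform library.
cv = counts.std() / counts.mean()
print(f"H = {entropy(p):.4f}, CV = {cv:.4f}")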

Usage:

  1. Save the original library sgRNA list as 'sgRNA_reference_list.txt', one identifier per line; the amplified library counts are read directly from 'fixed_A01_R1_001_alignment.sam'.

  2. Run the script:

python sgRNA_library_analysis.py

  3. The script will generate the following files:

    • amplified_distribution.png

    • library_comparison.png

    • sgRNA_library_analysis_report.pdf

Notes:

  • The script assumes your input files are correctly formatted. If your format differs, adjust the read_sam and read_sgrna_list functions.

  • For very large datasets, consider optimizing memory usage, e.g. with chunking or dask (a sketch follows this list).

  • The visualization and report formatting can be adjusted further as needed.
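For the chunking suggestion above, a minimal sketch with pandas, assuming the counts sit in a large tab-separated file with 'Reference' and 'Count' columns (the format written by the merge script at the end of this section):

import pandas as pd
from collections import Counter

# Aggregate a large counts table in chunks so it never sits in memory at once.
totals = Counter()
for chunk in pd.read_csv('merged_sgrna_counts.tsv', sep='\t', chunksize=100_000):
    for ref, cnt in zip(chunk['Reference'], chunk['Count']):
        totals[ref] += cnt
print(f"{len(totals)} unique references")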

9:30 PM: This will generate two plots (sgrna_distribution.png and count_distribution.png) and print the summary statistics to the console.

import pysam
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import Counter

# Read a SAM file and count the occurrences of each sgRNA
def count_sgrna_from_sam(sam_file):
    counts = Counter()
    with pysam.AlignmentFile(sam_file, "r") as sam:
        for read in sam:
            if not read.is_unmapped:
                counts[read.reference_name] += 1
    return counts

# Read the target sgRNA names from the FASTA headers (bowtie2 uses the first
# whitespace-delimited token of each header line as the SAM reference name)
def read_target_names(fasta_file):
    names = []
    with open(fasta_file, 'r') as f:
        for line in f:
            if line.startswith('>'):
                names.append(line[1:].strip().split()[0])
    return names

# Main analysis function
def analyze_sgrna_counts(sam_file, target_file):
    # Load data
    counts = count_sgrna_from_sam(sam_file)
    target_names = read_target_names(target_file)
    
    # Make sure every target sgRNA appears in the counts; unseen ones get 0
    for name in target_names:
        if name not in counts:
            counts[name] = 0
    
    # Convert to a DataFrame
    df = pd.DataFrame.from_dict(counts, orient='index', columns=['counts'])
    df = df.sort_values('counts', ascending=True)
    
    # Compute the summary statistics
    average = df['counts'].mean()
    std_dev = df['counts'].std()
    lower_bound = max(0, average - std_dev)
    upper_bound = average + std_dev
    in_range_percentage = ((df['counts'] >= lower_bound) & (df['counts'] <= upper_bound)).mean() * 100

    # Plot 1: per-sgRNA counts, ranked from lowest to highest
    plt.figure(figsize=(12, 6))
    plt.plot(range(len(df)), df['counts'], color='black')
    plt.title('Distribution of sgRNA Counts')
    plt.xlabel('sgRNAs (ranked by count)')
    plt.ylabel('Counts')
    plt.savefig('sgrna_distribution.png')
    plt.close()

    # Plot 2: histogram of counts with a log-scaled y axis
    plt.figure(figsize=(12, 6))
    sns.histplot(df['counts'], kde=True, log_scale=(False, True))
    plt.title('Distribution of Counts')
    plt.xlabel('Read Count')
    plt.ylabel('Number of sgRNAs')
    plt.savefig('count_distribution.png')
    plt.close()

    # Print the summary statistics
    print(f"Average count: {average:.4f}")
    print(f"Standard deviation: {std_dev:.4f}")
    print(f"Range (mean ± SD): {lower_bound:.0f} to {upper_bound:.0f}")
    print(f"Percentage of sgRNAs within mean ± SD: {in_range_percentage:.5f} %")

    return df

# Run the analysis
sam_file = 'fixed_A01_R1_001_alignment.sam'
target_file = 'target_sequences.fa'
result_df = analyze_sgrna_counts(sam_file, target_file)

Run it from the terminal:

python analyze_sgrna.py

Aggregating counts across four SAM files

Process the four SAM files, merge records that share the same reference name, and sum their counts. Reference names that appear in only some of the files must also be included in the final result. The following Python code implements this:

import pysam
from collections import defaultdict

def merge_sam_files(sam_files):
    # Accumulates counts for every unique reference name
    merged_counts = defaultdict(int)
    
    # Iterate over all the SAM files
    for sam_file in sam_files:
        with pysam.AlignmentFile(sam_file, "r") as sam:
            for read in sam:
                if not read.is_unmapped:
                    # Use the reference name (the sgRNA identifier) as the key
                    key = read.reference_name
                    # Sum the counts across files
                    merged_counts[key] += 1
    
    return merged_counts

def write_merged_counts(merged_counts, output_file):
    with open(output_file, 'w') as f:
        f.write("Reference\tCount\n")  # write the header row
        for ref, count in sorted(merged_counts.items(), key=lambda x: x[1], reverse=True):
            f.write(f"{ref}\t{count}\n")

# Main function
def main():
    # Paths to the four SAM files
    sam_files = [
        "fixed_A01_R1_001_alignment.sam",
        "fixed_A01_R2_001_alignment.sam",
        "fixed_A02_R1_001_alignment.sam",
        "fixed_A02_R2_001_alignment.sam"
    ]
    
    # Merge the SAM files
    merged_counts = merge_sam_files(sam_files)
    
    # Write the result to the output file
    output_file = "merged_sgrna_counts.tsv"
    write_merged_counts(merged_counts, output_file)
    
    print(f"Merge complete. Results written to {output_file}")

if __name__ == "__main__":
    main()
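
To feed the merged counts back into the plotting functions from the first script, the TSV can be reshaped into the same ['sgRNA', 'count'] DataFrame (a sketch, assuming the header written by write_merged_counts above):

import pandas as pd

# Load the merged counts and rename the columns to the shape expected by
# plot_distribution() in the first script.
merged = pd.read_csv('merged_sgrna_counts.tsv', sep='\t')
merged = merged.rename(columns={'Reference': 'sgRNA', 'Count': 'count'})
print(merged.head())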