Peak calling using homer combining sicer and macs2

gene_x 0 like s 920 view s

Tags: pipeline, ChIP-seq

  1. nextflow processing data

    V_8_1_6_p601_d8_D1_H3K4me3.fastq.gz,V_8_1_6_p601_d8_D1_input.fastq.gz,p601_H3K4me3_D1
    V_8_1_5_p601_d8_D2_H3K4me3.fastq.gz,V_8_1_5_p601_d8_D2_input.fastq.gz,p601_H3K4me3_D2
    V_8_1_6_p604_d8_D1_H3K4me3.fastq.gz,V_8_1_6_p604_d8_D1_input.fastq.gz,p604_H3K4me3_D1
    V_8_1_5_p604_d8_D2_H3K4me3.fastq.gz,V_8_1_5_p604_d8_D2_input.fastq.gz,p604_H3K4me3_D2
    V_8_1_6_p601_d8_D1_H3K27me3.fastq.gz,V_8_1_6_p601_d8_D1_input.fastq.gz,p601_H3K27me3_D1
    V_8_1_5_p601_d8_D2_H3K27me3.fastq.gz,V_8_1_5_p601_d8_D2_input.fastq.gz,p601_H3K27me3_D2
    V_8_1_6_p604_d8_D1_H3K27me3.fastq.gz,V_8_1_6_p604_d8_D1_input.fastq.gz,p604_H3K27me3_D1
    V_8_1_5_p604_d8_D2_H3K27me3.fastq.gz,V_8_1_5_p604_d8_D2_input.fastq.gz,p604_H3K27me3_D2
    V_8_1_7_p601_d8_D1_H3K9me3.fastq.gz,V_8_1_6_p601_d8_D1_input.fastq.gz,p601_H3K9me3_D1
    V_8_1_7_p601_d8_D2_H3K9me3.fastq.gz,V_8_1_5_p601_d8_D2_input.fastq.gz,p601_H3K9me3_D2
    V_8_1_7_p604_d8_D1_H3K9me3.fastq.gz,V_8_1_6_p604_d8_D1_input.fastq.gz,p604_H3K9me3_D1
    V_8_1_7_p604_d8_D2_H3K9me3.fastq.gz,V_8_1_5_p604_d8_D2_input.fastq.gz,p604_H3K9me3_D2
    V_8_1_8_p601_d8_D1_H3K27ac.fastq.gz,V_8_1_6_p601_d8_D1_input.fastq.gz,p601_H3K27ac_D1
    V_8_1_8_p601_d8_D2_H3K27ac.fastq.gz,V_8_1_5_p601_d8_D2_input.fastq.gz,p601_H3K27ac_D2
    V_8_1_8_p604_d8_D1_H3K27ac.fastq.gz,V_8_1_6_p604_d8_D1_input.fastq.gz,p604_H3K27ac_D1
    V_8_1_8_p604_d8_D2_H3K27ac.fastq.gz,V_8_1_5_p604_d8_D2_input.fastq.gz,p604_H3K27ac_D2
    
    nextflow run NGI-ChIPseq/main.nf --reads '/home/jhuang/DATA/Data_Denise_LT_DNA_Binding/ChIPseq_histone_hg38/H3K4me3_H3K27ac__H3K27me3_H3K9me3/Raw_Data_GEO_uploaded/*.fastq.gz' --genome hg38 --macsconfig macs.config --saveReference --saveAlignedIntermediates --singleEnd --blacklist_filtering -profile standard --project NHDF_enhancer_analysis_hg38 -resume
    
    nextflow run NGI-ChIPseq/main.nf --reads '/mnt/h1/jhuang/DATA/Data_Denise_LT_DNA_Binding/ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/Raw_Data/*.fastq.gz' --genome hg38 --macsconfig macs.config --saveReference --saveAlignedIntermediates --singleEnd --blacklist_filtering -profile standard --project NHDF_enhancer_analysis_hg38 -resume
    
    (DEBUG: Control doesn't work well!)
    lrwxrwxrwx 1 jhuang jhuang   37 Mai 19 11:19 NHDF-Ad_H3K4me1_r1.fastq.gz -> ../Raw_Data_orig/SRR568344_1.fastq.gz
    lrwxrwxrwx 1 jhuang jhuang   37 Mai 19 11:19 NHDF-Ad_H3K4me1_r2.fastq.gz -> ../Raw_Data_orig/SRR568345_1.fastq.gz
    lrwxrwxrwx 1 jhuang jhuang   37 Mai 19 11:19 NHDF-Ad_H3K27ac_r1.fastq.gz -> ../Raw_Data_orig/SRR227397_1.fastq.gz
    lrwxrwxrwx 1 jhuang jhuang   37 Mai 19 11:19 NHDF-Ad_H3K27ac_r2.fastq.gz -> ../Raw_Data_orig/SRR227398_1.fastq.gz
    lrwxrwxrwx 1 jhuang jhuang   37 Mai 19 11:19 NHDF-Ad_Control_r1.fastq.gz -> ../Raw_Data_orig/SRR227590_1.fastq.gz
    lrwxrwxrwx 1 jhuang jhuang   37 Mai 19 11:19 NHDF-Ad_Control_r2.fastq.gz -> ../Raw_Data_orig/SRR227591_1.fastq.gz
    
  2. make homer directories and findPeaks with HOMER under (myperl)

    conda activate myperl
    
    #Why do I need give "-genome hg38" in makeTagDirectory?
    #If you don't provide a genome with the -genome option, HOMER will only count the number of tags in each region without any genomic context or sequence information. #So, it is essential to include this information when creating a tag directory if you plan to perform any genome-based analysis.
    
    makeTagDirectory p601_d8_D1_input ../results/picard/V_8_1_6_p601_d8_D1_input.dedup.sorted.bam -genome hg38
    makeTagDirectory p601_d8_D2_input ../results/picard/V_8_1_5_p601_d8_D2_input.dedup.sorted.bam -genome hg38
    makeTagDirectory p604_d8_D1_input ../results/picard/V_8_1_6_p604_d8_D1_input.dedup.sorted.bam -genome hg38
    makeTagDirectory p604_d8_D2_input ../results/picard/V_8_1_5_p604_d8_D2_input.dedup.sorted.bam -genome hg38
    makeTagDirectory p601_d8_D1_H3K4me3 ../results/picard/V_8_1_6_p601_d8_D1_H3K4me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p601_d8_D2_H3K4me3 ../results/picard/V_8_1_5_p601_d8_D2_H3K4me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p604_d8_D1_H3K4me3 ../results/picard/V_8_1_6_p604_d8_D1_H3K4me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p604_d8_D2_H3K4me3 ../results/picard/V_8_1_5_p604_d8_D2_H3K4me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p601_d8_D1_H3K27me3 ../results/picard/V_8_1_6_p601_d8_D1_H3K27me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p601_d8_D2_H3K27me3 ../results/picard/V_8_1_5_p601_d8_D2_H3K27me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p604_d8_D1_H3K27me3 ../results/picard/V_8_1_6_p604_d8_D1_H3K27me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p604_d8_D2_H3K27me3 ../results/picard/V_8_1_5_p604_d8_D2_H3K27me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p601_d8_D1_H3K27ac ../results/picard/V_8_1_8_p601_d8_D1_H3K27ac.dedup.sorted.bam -genome hg38
    makeTagDirectory p601_d8_D2_H3K27ac ../results/picard/V_8_1_8_p601_d8_D2_H3K27ac.dedup.sorted.bam -genome hg38
    makeTagDirectory p604_d8_D1_H3K27ac ../results/picard/V_8_1_8_p604_d8_D1_H3K27ac.dedup.sorted.bam -genome hg38
    makeTagDirectory p604_d8_D2_H3K27ac ../results/picard/V_8_1_8_p604_d8_D2_H3K27ac.dedup.sorted.bam -genome hg38
    
    for sample in p601_d8_D1_input p601_d8_D2_input p604_d8_D1_input p604_d8_D2_input  p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 p604_d8_D1_H3K4me3 p604_d8_D2_H3K4me3  p601_d8_D1_H3K27me3 p601_d8_D2_H3K27me3 p604_d8_D1_H3K27me3 p604_d8_D2_H3K27me3  p601_d8_D1_H3K27ac p601_d8_D2_H3K27ac p604_d8_D1_H3K27ac p604_d8_D2_H3K27ac; do
        makeUCSCfile ${sample} -pseudo 1 -bigWig /home/jhuang/REFs/hg38.chromSizes -o auto -style chipseq    -norm 1e7 -normLength 100 -fsize 1
    done
    
    # -- not necessary any more: using MACS2 and SICER instead of using findPeaks --
    # #factor (transcription factor ChIP-Seq, uses -center, output: peaks.txt,  default)
    # #histone (histone modification ChIP-Seq, region based, uses -region -size 500 -L 0, regions.txt)
    # for sample in p601_d8_D1 p601_d8_D2 p604_d8_D1 p604_d8_D2; do
    #   #Finding peaks of size 1000, no closer than 2000
    #   findPeaks ${sample}_H3K4me3 -style factor -size 1000  -o auto -i ${sample}_input
    #   #-minDist <#> (minimum distance between peaks, default: peak size x2)
    #   #findPeaks ${sample}_H3K27me3 -style histone -region -size 3000 -minDist 5000 -o auto -i ${sample}_input
    #   #findPeaks ${sample}_H3K27ac -style factor -size 200 -minDist 200 -o auto -i ${sample}_input
    #   #findPeaks ${sample}_H3K4me1 -style histone -region -size 1000 -minDist 2500 -o auto -i ${sample}_input
    # done
    
    ./p601_d8_D1_H3K4me3/peaks.txt
    ./p601_d8_D2_H3K4me3/peaks.txt
    ./p604_d8_D1_H3K4me3/peaks.txt
    ./p604_d8_D2_H3K4me3/peaks.txt
    
    ./p601_d8_D1_H3K27me3/regions.txt
    ./p601_d8_D2_H3K27me3/regions.txt
    ./p604_d8_D1_H3K27me3/regions.txt
    ./p604_d8_D2_H3K27me3/regions.txt
    
    for dir in p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 p604_d8_D1_H3K4me3 p604_d8_D2_H3K4me3; do
    awk -v OFS='\t' '{print $2, $3, $4, $1, $6}' ./${dir}/peaks.txt > ${dir}_peaks.bed
    grep -v "#" ${dir}_peaks.bed | sort -k1,1 -k2,2n > ${dir}_sorted_peaks.bed
    done
    
    for dir in p601_d8_D1_H3K27me3 p601_d8_D2_H3K27me3 p604_d8_D1_H3K27me3 p604_d8_D2_H3K27me3; do
    awk -v OFS='\t' '{print $2, $3, $4, $1, $6}' ./${dir}/regions.txt > ${dir}_regions.bed
    grep -v "#" ${dir}_regions.bed | sort -k1,1 -k2,2n > ${dir}_sorted_regions.bed
    done
    
    #DEBUG: why the bam files so small?
    makeTagDirectory NHDF-Ad_Control_r1 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_Control_r1.dedup.sorted.bam -genome hg38
    makeTagDirectory NHDF-Ad_Control_r2 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_Control_r2.dedup.sorted.bam -genome hg38
    makeTagDirectory NHDF-Ad_H3K27ac_r1 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_H3K27ac_r1.dedup.sorted.bam -genome hg38
    makeTagDirectory NHDF-Ad_H3K27ac_r2 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_H3K27ac_r2.dedup.sorted.bam -genome hg38
    makeTagDirectory NHDF-Ad_H3K4me1_r1 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_H3K4me1_r1.dedup.sorted.bam -genome hg38
    makeTagDirectory NHDF-Ad_H3K4me1_r2 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_H3K4me1_r2.dedup.sorted.bam -genome hg38
    
    NHDF-Ad_Control_r1 NHDF-Ad_Control_r2 NHDF-Ad_H3K27ac_r1 NHDF-Ad_H3K27ac_r2 NHDF-Ad_H3K4me1_r1 NHDF-Ad_H3K4me1_r2
    
    > (myperl) environments for HOMER, ~/Tools/diffreps/bin/diffReps.pl, MACS2, ~/Tools/SICER1.1/SICER/SICER.sh
    
  3. combine the diffReps.pl and HOMER annotatePeaks.pl

    #Dynamic regions were defined as MACS (H3K4me3, H3K27ac) or SICER (H3K4me1, H3K27me3) peaks overlapping significantly (≥ 2-fold change, adjusted P-value ≤ 0.05) up- or down-regulated differentially enriched regions from diffReps in the three pairwise comparisons WAC vs mock, WA314 vs mock and WAC vs WA314.
    
    #STEP1
    #--> not given "--gname hg38"
    ## Step4: Annotate differential sites.
    #unless($noanno or $gname eq ''){
    #        `region_analysis.pl -i $report -r -d refseq -g $gname`;
    #}
    ## Step5: Look for hotspots.
    #unless($nohs){
    #        my $hotspot = $report . '.hotspot';
    #        `findHotspots.pl -d $report -o $hotspot`;
    #}
    ~/Tools/diffreps/bin/diffReps.pl -tr ../results/picard/V_8_1_6_p601_d8_D1_H3K4me3.dedup.sorted.bed ../results/picard/V_8_1_5_p601_d8_D2_H3K4me3.dedup.sorted.bed -co ../results/picard/V_8_1_6_p601_d8_D1_input.dedup.sorted.bed ../results/picard/V_8_1_5_p601_d8_D2_input.dedup.sorted.bed  --report output_results  --chrlen /home/jhuang/REFs/hg38.chromSizes --nsd sharp
    
    #STEP2
    #replace Chr to '#Chr'
    grep -v "#" output_results | sort -k1,1 -k2,2n > output_results_
    awk 'BEGIN {OFS="\t"} {print $1, $2, $3, "diffreps_peak_"NR, $12}' output_results_ > H3K4me3.bed
    #grep -v "#" H3K4me3.bed | sort -k1,1 -k2,2n > H3K4me3_sorted_peaks.bed
    
    #STEP3 (under myperl) peak calling macs2 for narrow peaks, CISER for broad peaks!
    #process the output of diffReps.pl to BED file.
    annotatePeaks.pl H3K4me3.bed hg38 > H3K4me3_annotated_peaks.txt
    
  4. combine macs2 to getDifferentialPeaksReplicates.pl

    replace the initial peak identification by using your MACS2 output.
    
    #http://homer.ucsd.edu/homer/ngs/diffExpression.html
    #getDifferentialPeaksReplicates.pl = findPeaks + annotatePeaks.pl + getDiffExpression.pl 
    #annotatePeaks.pl tss hg38 -raw -d H3K4me3-Mock-rep1/ H3K4me3-Mock-rep2/ H3K4me3-WNT-rep1/ H3K4me3-WNT-rep3/ > countTable.peaks.txt
    
    Here's an outline of how we might be able to replace the initial peak identification by using your MACS2 output.
    
    #TODO: using MACS call peaks of the data H3K27ac.
    

4.1. MACS2 peak calling

  #macs2 --> bed --> annotatePeaks.pl
  conda activate ngi_chipseq_ac2
  macs2 callpeak -t ../results/picard/V_8_1_6_p601_d8_D1_H3K4me3.dedup.sorted.bam -c ../results/picard/V_8_1_6_p601_d8_D1_input.dedup.sorted.bam -f BAM -g hs -n p601_d8_D1 -q 0.05
  macs2 callpeak -t ../results/picard/V_8_1_5_p601_d8_D2_H3K4me3.dedup.sorted.bam -c ../results/picard/V_8_1_5_p601_d8_D2_input.dedup.sorted.bam -f BAM -g hs -n p601_d8_D2 -q 0.05
  macs2 callpeak -t ../results/picard/V_8_1_6_p604_d8_D1_H3K4me3.dedup.sorted.bam -c ../results/picard/V_8_1_6_p604_d8_D1_input.dedup.sorted.bam -f BAM -g hs -n p604_d8_D1 -q 0.05
  macs2 callpeak -t ../results/picard/V_8_1_5_p604_d8_D2_H3K4me3.dedup.sorted.bam -c ../results/picard/V_8_1_5_p604_d8_D2_input.dedup.sorted.bam -f BAM -g hs -n p604_d8_D2 -q 0.05

  awk 'OFS="\t" {print $1, $2, $3, $4, $5}' p601_d8_D1_peaks.narrowPeak > p601_d8_D1_peaks.bed
  awk 'OFS="\t" {print $1, $2, $3, $4, $5}' p601_d8_D2_peaks.narrowPeak > p601_d8_D2_peaks.bed
  awk 'OFS="\t" {print $1, $2, $3, $4, $5}' p604_d8_D1_peaks.narrowPeak > p604_d8_D1_peaks.bed
  awk 'OFS="\t" {print $1, $2, $3, $4, $5}' p604_d8_D2_peaks.narrowPeak > p604_d8_D2_peaks.bed

  #annotatePeaks.pl p601_d8_D1_peaks.bed hg38 > p601_d8_D1_annotated_peaks.txt
  #annotatePeaks.pl p601_d8_D2_peaks.bed hg38 > p601_d8_D2_annotated_peaks.txt
  #annotatePeaks.pl p604_d8_D1_peaks.bed hg38 > p604_d8_D1_annotated_peaks.txt
  #annotatePeaks.pl p604_d8_D2_peaks.bed hg38 > p604_d8_D2_annotated_peaks.txt

4.2. Convert your MACS2 peaks to HOMER-compatible format. You can do this manually or with a script. For example:

It's possible to use more information from the MACS2 output file to create a more informative peaks.txt file for HOMER. However, it's important to note that some information that HOMER needs for its differential peak analysis is not available in the MACS2 output (such as Normalized Tag Count, Control Tags, and others). But we can certainly map more of the available MACS2 columns to the corresponding HOMER columns.

  #The following awk command can be used to convert more MACS2 information into the HOMER format:

  cd macs2
  awk 'BEGIN{OFS="\t"}{print $1,$2,$3,"Peak_"NR,$5,$6,$7,$8,$9,$10}' macs2_peaks.bed > macs2_peaks.txt
  awk 'BEGIN{OFS="\t"} NR > 1 {print $10, $1, $2, $3, "+", "0", "0", $6, $6, "0", $8, $5, $9, "0", "0"}' p601_d8_D1_peaks.xls > p601_d8_D1_macs2_peaks.txt
  awk 'BEGIN{OFS="\t"} NR > 1 {print $10, $1, $2, $3, "+", "0", "0", $6, $6, "0", $8, $5, $9, "0", "0"}' p601_d8_D2_peaks.xls > p601_d8_D2_macs2_peaks.txt
  awk 'BEGIN{OFS="\t"} NR > 1 {print $10, $1, $2, $3, "+", "0", "0", $6, $6, "0", $8, $5, $9, "0", "0"}' p604_d8_D1_peaks.xls > p604_d8_D1_macs2_peaks.txt
  awk 'BEGIN{OFS="\t"} NR > 1 {print $10, $1, $2, $3, "+", "0", "0", $6, $6, "0", $8, $5, $9, "0", "0"}' p604_d8_D2_peaks.xls > p604_d8_D2_macs2_peaks.txt

  This command will:

      * Skip the header line (NR > 1)
      * Map the MACS2 peak name ($10) to the HOMER PeakID
      * Map the MACS2 chromosome, start, and end ($1, $2, $3) to the HOMER chr, start, end
      * Use a placeholder "+" for the HOMER strand
      * Use a placeholder "0" for the HOMER Normalized Tag Count and Focus Ratio
      * Map the MACS2 pileup ($6) to the HOMER findPeaks Score and Total Tags
      * Use a placeholder "0" for the HOMER Control Tags
      * Map the MACS2 fold_enrichment ($8) to the HOMER Fold Change vs Control
      * Map the MACS2 abs_summit ($5) to the HOMER p-value vs Control
      * Map the MACS2 -log10(qvalue) ($9) to the HOMER Fold Change vs Local
      * Use a placeholder "0" for the HOMER p-value vs Local and Clonal Fold Change

  This script is limited by the differences in the information provided by MACS2 and HOMER. While it makes use of as much information as possible from the MACS2 output, some columns in the HOMER format still have to be filled with placeholder values.

- The following awk command can be used to convert more SICER information into the HOMER format (TODO) oder directly using findPeaks.pl.

- The following awk command can be used to convert more diffReps.pl information into the HOMER format (TODO).

4.3. Associate the converted peak files with their respective tag directories. In HOMER, peak files can be associated with a tag directory by placing them in the tag directory with the filename "peaks.txt".

  mv homer/p601_d8_D1_H3K4me3/peaks.txt homer/p601_d8_D1_H3K4me3/peaks_raw.txt
  mv homer/p601_d8_D2_H3K4me3/peaks.txt homer/p601_d8_D2_H3K4me3/peaks_raw.txt
  mv homer/p604_d8_D1_H3K4me3/peaks.txt homer/p604_d8_D1_H3K4me3/peaks_raw.txt
  mv homer/p604_d8_D2_H3K4me3/peaks.txt homer/p604_d8_D2_H3K4me3/peaks_raw.txt
  cp macs2/p601_d8_D1_macs2_peaks.txt homer/p601_d8_D1_H3K4me3/peaks.txt
  cp macs2/p601_d8_D2_macs2_peaks.txt homer/p601_d8_D2_H3K4me3/peaks.txt
  cp macs2/p604_d8_D1_macs2_peaks.txt homer/p604_d8_D1_H3K4me3/peaks.txt
  cp macs2/p604_d8_D2_macs2_peaks.txt homer/p604_d8_D2_H3K4me3/peaks.txt

  #Repeat this for each of your tag directories.

4.4. The program getDifferentialPeaksReplicates will essentially perform 3 steps, in the step 2 was modified.

First, it will pool the target tag directories and input directories separately into pooled experiments and perform an initial peak identification (using findPeaks). Pooling the experiments is generally more sensitive than trying to merge the individual peak files coming from each experiment (although this can be done using the "-use " option if each directory already has a peak file associated with it). Next, it will quantify the reads at the initial putative peaks across each of the target and input tag directories using annotatePeaks.pl. Finally, it calls getDiffExpression.pl and ultimately passes these values to the R/Bioconductor package DESeq2 to calculate enrichment values for each peak, returning only those peaks that pass a given fold enrichment (default: 2-fold) and FDR cutoff (default 5%). We can run getDifferentialPeaksReplicates.pl with the -use option to specify that the provided peaks should be used instead of calling findPeaks:

    #-- Successful modification of the script getDifferentialPeaksReplicates.pl --
    #The -d parameter in the mergePeaks function in HOMER is used to specify the maximum distance between peak centers
    #change Max distance to merge to 30000 bp in getDifferentialPeaksReplicates.pl
    #mergePeaks -d 30000 temp_sorted | sort

    #conda list homer  #4.11
    mergePeaks p601_d8_D1_H3K27me3/peaks.txt p601_d8_D2_H3K27me3/peaks.txt >  mergePeaks_res.txt
    python3 update_header.py
    cat p601_d8_D1_H3K27me3/peaks.txt p601_d8_D2_H3K27me3/peaks.txt > temp
    awk '{print $2 "\t" $3 "\t" $4 "\t" $1}' temp | sort -k1,1 -k2,2n | bedtools merge -d 1000 > bedtools_res.txt
    python3 adjust_mergePeaks_res.py

    #check if the results are correct
    cut -d$'\t' -f2-4 filtered_mergePeaks_res.txt > control1
    diff control1 bedtools_res.txt

  #(myperl) jhuang@hamburg:~/DATA/Data_Denise_LT_DNA_Bindung/results_chipseq_histone_hg38/H3K4me3_H3K27ac__H3K27me3_H3K9me3/homer$
  #getDifferentialPeaksReplicates.pl -use <tag_directory1>/peaks.txt,<tag_directory2>/peaks.txt ...
  #TOO TIME_CONSUMING, using original version
  getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K4me3_macs2_peaks.txt
  getDifferentialPeaksReplicates.pl -t p604_d8_D1_H3K4me3 p604_d8_D2_H3K4me3 -i p604_d8_D1_input p604_d8_D2_input -genome hg38 -use peaks.txt > p604_d8_H3K4me3_macs2_peaks.txt
  #Remember to replace <tag_directory1>/peaks.txt,<tag_directory2>/peaks.txt ... with the path to your own tag directories and peak files.

4.5. Draw plots

  import matplotlib.pyplot as plt
  from matplotlib_venn import venn2
  venn2(subsets=(2476, 3567, 22719), set_labels=('Donor 1', 'Donor 2'))
  plt.title('Peaks between p601 d8 H3K4me3 Donors')
  plt.xlabel('Number of Elements')
  plt.ylabel('')
  plt.savefig('Peak_Consistency_Between_p601_d8_H3K4me3_Donors.png', dpi=300, bbox_inches='tight')
  #2476+3567+22719=28762
  venn2(subsets=(2681, 3410, 19044), set_labels=('Donor 1', 'Donor 2'))
  plt.title('Peaks between p604 d8 H3K4me3 Donors')
  plt.xlabel('Number of Elements')
  plt.ylabel('')
  plt.savefig('Peak_Consistency_Between_p604_d8_H3K4me3_Donors.png', dpi=300, bbox_inches='tight')
  #2681+3410+19044=25135

  awk 'NR>1 {print $2 "\t" $3 "\t" $4 "\t" $1}' p601_d8_H3K4me3_macs2_peaks.txt > p601_d8_H3K4me3_macs2_peaks.bed
  awk 'NR>1 {print $2 "\t" $3 "\t" $4 "\t" $1}' p604_d8_H3K4me3_macs2_peaks.txt > p604_d8_H3K4me3_macs2_peaks.bed
  ~/Tools/csv2xls-0.4/csv_to_xls.py p601_d8_H3K4me3_macs2_peaks.txt -d$'\t' -o p601_d8_H3K4me3_macs2_peaks.xls
  ~/Tools/csv2xls-0.4/csv_to_xls.py p604_d8_H3K4me3_macs2_peaks.txt -d$'\t' -o p604_d8_H3K4me3_macs2_peaks.xls

5.1. SICER peak calling (under env myperl)

  #SICER --> bed --> annotatePeaks.pl
  mkdir sicer; cd sicer;
  ln -s ../results/picard/V_8_1_6_p601_d8_D1_H3K27me3.dedup.sorted.bed .
  ln -s ../results/picard/V_8_1_6_p601_d8_D1_input.dedup.sorted.bed .
  ln -s ../results/picard/V_8_1_5_p601_d8_D2_H3K27me3.dedup.sorted.bed .
  ln -s ../results/picard/V_8_1_5_p601_d8_D2_input.dedup.sorted.bed .
  ln -s ../results/picard/V_8_1_6_p604_d8_D1_H3K27me3.dedup.sorted.bed .
  ln -s ../results/picard/V_8_1_6_p604_d8_D1_input.dedup.sorted.bed .
  ln -s ../results/picard/V_8_1_5_p604_d8_D2_H3K27me3.dedup.sorted.bed .
  ln -s ../results/picard/V_8_1_5_p604_d8_D2_input.dedup.sorted.bed .

  #chr10:49,003,170-51,222,175
  #chr10:48,528,307-50,747,312
  #chr10:58,422,744-60,641,749
  mkdir p601_d8_D1 p601_d8_D2 p604_d8_D1 p604_d8_D2
  #/home/jhuang/Tools/SICER1.1/SICER/SICER.sh [InputDir] [bed file] [control file] [OutputDir] [Species] [redundancy threshold] [window size (bp)] [fragment size] [effective genome fraction] [gap size (bp)] [FDR]
  # 10000 is window size, 30000 is the gap size, 160 is the fragment size
  ~/Tools/SICER1.1/SICER/SICER.sh . V_8_1_6_p601_d8_D1_H3K27me3.dedup.sorted.bed V_8_1_6_p601_d8_D1_input.dedup.sorted.bed p601_d8_D1 hg38 1 10000 160 0.74 30000 0.01;
  ~/Tools/SICER1.1/SICER/SICER.sh . V_8_1_5_p601_d8_D2_H3K27me3.dedup.sorted.bed V_8_1_5_p601_d8_D2_input.dedup.sorted.bed p601_d8_D2 hg38 1 10000 160 0.74 30000 0.01;
  ~/Tools/SICER1.1/SICER/SICER.sh . V_8_1_6_p604_d8_D1_H3K27me3.dedup.sorted.bed V_8_1_6_p604_d8_D1_input.dedup.sorted.bed p604_d8_D1 hg38 1 10000 160 0.74 30000 0.01;
  ~/Tools/SICER1.1/SICER/SICER.sh . V_8_1_5_p604_d8_D2_H3K27me3.dedup.sorted.bed V_8_1_5_p604_d8_D2_input.dedup.sorted.bed p604_d8_D2 hg38 1 10000 160 0.74 30000 0.01;
  #TODO: - check if the peak calling works well in IGV!
  # - call peaks for H3K27me1, adjust the SICER-parameters!
  # - transform them peaks.txt of HOMER, and call getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K27me3 p601_d8_D2_H3K27me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K27me3_sicer_peaks.txt
  getDifferentialPeaksReplicates.pl -t p604_d8_D1_H3K27me3 p604_d8_D2_H3K27me3 -i p604_d8_D1_input p604_d8_D2_input -genome hg38 -use peaks.txt > p604_d8_H3K27me3_sicer_peaks.txt
  # - write a mail to Denise, sending the results of bigWig-files of H3K4me3 and H3K27me3, and called peaks. By the way, asks if she needs the Gene members of the red colors in the PCA plot!

  #Note that histone using 'cat file1.bed file2.bed | sort -k1,1 -k2,2n | bedtools merge > merged.bed'
  #Note that factor using HOMER getDifferentialPeaksReplicates from begining: "mergePeaks.pl --> DESeq recheck"!

  mergePeaks sicer/p601_d8_D1/V_8_1_6_p601_d8_D1_H3K27me3.dedup.sorted-W10000-G30000-FDR0.01-island.bed sicer/p601_d8_D2/V_8_1_5_p601_d8_D2_H3K27me3.dedup.sorted-W10000-G30000-FDR0.01-island.bed -d 10000 > mergedPeaks1.txt

  cat sicer/p601_d8_D1/V_8_1_6_p601_d8_D1_H3K27me3.dedup.sorted-W10000-G30000-FDR0.01-island.bed sicer/p601_d8_D2/V_8_1_5_p601_d8_D2_H3K27me3.dedup.sorted-W10000-G30000-FDR0.01-island.bed | sort -k1,1 -k2,2n | bedtools merge > merged_p601_d8.bed
  #4957 (4388, 4822)

  awk 'BEGIN{OFS="\t"; print "PeakID\tchr\tstart\tend\tstrand"} {print "Peak-"NR, $1, $2, $3, "."}' merged_p601_d8.bed > homer_peaks.bed
  #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!TODO!!!!!!!!!!!!!!!!!!!!!!!!: The bedtools merge command does not generate output in the HOMER peak format directly. However, you can use a combination of Unix command line tools (e.g., awk, sort) to convert bedtools output into HOMER format.
  #bedtools merge -i input.bed > merged.bed
  #awk 'BEGIN{OFS="\t"; print "PeakID\tchr\tstart\tend\tstrand"} {print "Peak-"NR, $1, $2, $3, "."}' merged.bed > homer_peaks.bed

5.2-3. Convert your SICER peaks to HOMER-compatible format, and replace them in homer/${sample}/peaks.txt. You can do this manually or with a script.

- The following awk command can be used to convert more MACS2 information into the HOMER format:

  mv homer/p601_d8_D1_H3K27me3/regions.txt homer/p601_d8_D1_H3K27me3/regions_raw.txt
  mv homer/p601_d8_D2_H3K27me3/regions.txt homer/p601_d8_D2_H3K27me3/regions_raw.txt
  mv homer/p604_d8_D1_H3K27me3/regions.txt homer/p604_d8_D1_H3K27me3/regions_raw.txt
  mv homer/p604_d8_D2_H3K27me3/regions.txt homer/p604_d8_D2_H3K27me3/regions_raw.txt

  #NOTE that we should name the file as peaks.txt, since the Input-samples contain only peaks.txt.
  awk 'BEGIN{OFS="\t"} {print $1"-"$2"-"$3,$1,$2,$3,"+",$4,0,0,$4,$5,0,$6,0}' sicer/p601_d8_D1/V_8_1_6_p601_d8_D1_H3K27me3.dedup.sorted-W10000-G30000-islands-summary-FDR0.01 > homer/p601_d8_D1_H3K27me3/peaks.txt
  awk 'BEGIN{OFS="\t"} {print $1"-"$2"-"$3,$1,$2,$3,"+",$4,0,0,$4,$5,0,$6,0}' sicer/p601_d8_D2/V_8_1_5_p601_d8_D2_H3K27me3.dedup.sorted-W10000-G30000-islands-summary-FDR0.01 > homer/p601_d8_D2_H3K27me3/peaks.txt
  awk 'BEGIN{OFS="\t"} {print $1"-"$2"-"$3,$1,$2,$3,"+",$4,0,0,$4,$5,0,$6,0}' sicer/p604_d8_D1/V_8_1_6_p604_d8_D1_H3K27me3.dedup.sorted-W10000-G30000-islands-summary-FDR0.01 > homer/p604_d8_D1_H3K27me3/peaks.txt
  awk 'BEGIN{OFS="\t"} {print $1"-"$2"-"$3,$1,$2,$3,"+",$4,0,0,$4,$5,0,$6,0}' sicer/p604_d8_D2/V_8_1_5_p604_d8_D2_H3K27me3.dedup.sorted-W10000-G30000-islands-summary-FDR0.01 > homer/p604_d8_D2_H3K27me3/peaks.txt

  #The format is similar to the following.
  #chr1-83000-89999        chr1    83000   89999   +       21      0       0       21      27      0       5.623763510806874e-10   0       0       0
  #chr1-859000-865999      chr1    859000  865999  +       23      0       0       23      28      0       3.501056563888787e-11   0       0       0

  This command will:
      * PeakID: Format it as "chr-start-end".
      * chr: Same as SICER.
      * start: Same as SICER.
      * end: Same as SICER.
      * strand: SICER doesn't provide strand information, we could fill with a default value '+'. 
      * Normalized Tag Count: Same as the normalized read count column in SICER.
      * focus ratio: SICER doesn't provide this, we could fill with a default value '0'.
      * findPeaks Score: Corresponds to Fold Change or FDR column in SICER.
      * Total Tags: SICER doesn't provide this, we could fill with a default value '0'.
      * Control Tags (normalized to IP Experiment): Same as Clonal reads or local background column in SICER.
      * Fold Change vs Control: SICER doesn't provide this, we could fill with a default value '0'.
      * p-value vs Control: Same as the p-value column in SICER.
      * Fold Change vs Local: SICER doesn't provide this, we could fill with a default value '0'.
      * p-value vs Local: SICER doesn't provide this, we could fill with a default value '0'.
      * Clonal Fold Change: SICER doesn't provide this, we could fill with a default value '0'.

5.4. The program getDifferentialPeaksReplicates will essentially perform 3 steps, in the step 2 was modified.

    conda activate myperl
    cd homer
    #diff /home/jhuang/anaconda3/envs/myperl/bin/getDifferentialPeaksReplicates.pl /home/jhuang/homer/bin/getDifferentialPeaksReplicates.pl
    #vim /home/jhuang/anaconda3/envs/myperl/bin/getDifferentialPeaksReplicates.pl #max_distance_to_merge at line 203 to 133000, "-d <#>"
    ./getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K27me3 p601_d8_D2_H3K27me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K27me3_sicer_regions.txt
    ./getDifferentialPeaksReplicates.pl -t p604_d8_D1_H3K27me3 p604_d8_D2_H3K27me3 -i p604_d8_D1_input p604_d8_D2_input -genome hg38 -use peaks.txt > p604_d8_H3K27me3_sicer_regions.txt

5.5. Draw plots

    grep "p601_d8_D1_H3K27me3/peaks.txt|p601_d8_D2_H3K27me3/peaks.txt" p601_d8_H3K27me3_sicer_regions.txt | wc -l  # 799
    grep "p601_d8_D1_H3K27me3/peaks.txt" p601_d8_H3K27me3_sicer_regions.txt | wc -l                                # 867 - 799 = 68
    grep "p601_d8_D2_H3K27me3/peaks.txt" p601_d8_H3K27me3_sicer_regions.txt | wc -l                                # 3019 - 799 = 2220
    # 68 + 799 + 2220 = 3087
    grep "p604_d8_D1_H3K27me3/peaks.txt|p604_d8_D2_H3K27me3/peaks.txt" p604_d8_H3K27me3_sicer_regions.txt | wc -l  # 677
    grep "p604_d8_D1_H3K27me3/peaks.txt" p604_d8_H3K27me3_sicer_regions.txt | wc -l                                # 740 - 677 = 63
    grep "p604_d8_D2_H3K27me3/peaks.txt" p604_d8_H3K27me3_sicer_regions.txt | wc -l                                # 2393 - 677 = 1716
    # 63 + 677 + 1716 = 2456

  import matplotlib.pyplot as plt
  from matplotlib_venn import venn2
  venn2(subsets=(2476, 3567, 22719), set_labels=('Donor 1', 'Donor 2'))
  plt.title('Peaks between p601 d8 H3K27me3 Donors')
  plt.xlabel('Number of Elements')
  plt.ylabel('')
  plt.savefig('Peak_Consistency_Between_p601_d8_H3K27me3_Donors.png', dpi=300, bbox_inches='tight')
  #2476+3567+22719=28762
  venn2(subsets=(2681, 3410, 19044), set_labels=('Donor 1', 'Donor 2'))
  plt.title('Peaks between p604 d8 H3K27me3 Donors')
  plt.xlabel('Number of Elements')
  plt.ylabel('')
  plt.savefig('Peak_Consistency_Between_p604_d8_H3K27me3_Donors.png', dpi=300, bbox_inches='tight')
  #2681+3410+19044=25135

  awk 'NR>1 {print $2 "\t" $3 "\t" $4 "\t" $1}' p601_d8_H3K27me3_sicer_regions.txt > p601_d8_H3K27me3_sicer_regions.bed
  awk 'NR>1 {print $2 "\t" $3 "\t" $4 "\t" $1}' p604_d8_H3K27me3_sicer_regions.txt > p604_d8_H3K27me3_sicer_regions.bed
  ~/Tools/csv2xls-0.4/csv_to_xls.py p601_d8_H3K27me3_sicer_regions.txt -d$'\t' -o p601_d8_H3K27me3_sicer_regions.xls
  ~/Tools/csv2xls-0.4/csv_to_xls.py p604_d8_H3K27me3_sicer_regions.txt -d$'\t' -o p604_d8_H3K27me3_sicer_regions.xls
  1. getDifferentialPeaksReplicates.pl

    #!/usr/bin/env perl
    use warnings;
    use lib "/home/jhuang/homer/.//bin";
    my $homeDir = "/home/jhuang/homer/./";
    
    my $foldThresh = 2;
    my $fdrThresh = 0.05;
    my $peakFoldInput = 2;
    my $peakFdrInput = 0.001;
    my $style = "";
    
    sub printCMD {
        print STDERR "\n\tUsage: getDifferentialPeaksReplicates.pl [options] -t <IP tagdir1> [IP tagdir2] ...\n";
        print STDERR "\t                                         -b <background tagdir1> [background tagdir2] ...\n";
        print STDERR "\t                                         -i <Input tagdir1> [Input tagdir2] ...\n";
        print STDERR "\t\tNote: if input is provided, peaks will be called.\n";
        print STDERR "\n\tOptions:\n";
        #print STDERR "\t\t-F <#> (fold enrichment over bg, default: $foldThresh)\n";
        #print STDERR "\t\t-fdr <#> (FDR over input, default: $fdrThresh)\n";
        print STDERR "\t\t-f <#> (fold enrichment over bg, default: $foldThresh)\n";
        print STDERR "\t\t-q <#> (FDR over bg, default: $fdrThresh)\n";
        print STDERR "\t\t-fdr <#>, -F <#>, -L <#> (parameters for findPeaks)\n";
        print STDERR "\t\t-genome <genome version> (genome version to use for annotation)\n";
        print STDERR "\t\t-DESeq2 | -DESeq | -edgeR (differential stats algorithm, default: DESeq2)\n";
        print STDERR "\t\t-balanced (normalize signal across peaks, default: normalize to read totals)\n";
        print STDERR "\t\t-fragLength <#> (standardize estimated fragment length across analysis)\n";
        print STDERR "\t\t-all (report all peaks, not just differentially regulated)\n";
        print STDERR "\n\tPeak finding directives:\n";
        print STDERR "\t\t-style <factor|histone|tss> (findPeaks style to use for finding peaks)\n";
        print STDERR "\t\t-use <peaks.txt|regions.txt|tss.txt> (use existing peaks in tag directories)\n";
        print STDERR "\t\t-p <peak file> (use specific peak file instead of tagDir/peaks.txt or finding new one)\n";
        print STDERR "\t\tOther options will be passed to findPeaks\n";
        print STDERR "\n";
        exit;
    
    }
    
    my @targets = ();
    my @background = ();
    my @inputs = ();
    my $findPeaksOpts = "";
    my $use = "";
    my $givenPeakFile = '';
    my $norm2total = "-norm2total";
    my $diffAlg = "-DESeq2";
    my $genome = 'none';
    my $annOptions = "";
    my $fragLength = '';
    my $allFlag = 0;
    my $ogCmd = "getDifferentialPeaksReplicates.pl";
    for (my $i=0;$i<@ARGV;$i++) {
        $ogCmd .= " " . $ARGV[$i];
    }
    
    for (my $i=0;$i<@ARGV;$i++) {
        if ($ARGV[$i] eq '-t') {
            $i++;
            while ($i < @ARGV) {
                if ($ARGV[$i] =~ /^-/) {
                    $i--;
                    last;
                }
                push(@targets, $ARGV[$i++]);
            }
        } elsif ($ARGV[$i] eq '-i') {
            $i++;
            while ($i < @ARGV) {
                if ($ARGV[$i] =~ /^-/) {
                    $i--;
                    last;
                }
                push(@inputs, $ARGV[$i++]);
            }
        } elsif ($ARGV[$i] eq '-b') {
            $i++;
            while ($i < @ARGV) {
                if ($ARGV[$i] =~ /^-/) {
                    $i--;
                    last;
                }
                push(@background, $ARGV[$i++]);
            }
        } elsif ($ARGV[$i] eq '-f') {
            $foldThresh = $ARGV[++$i];
        } elsif ($ARGV[$i] eq '-genome') {
            $genome = $ARGV[++$i];
        } elsif ($ARGV[$i] eq '-q') {
            $fdrThresh = $ARGV[++$i];
        } elsif ($ARGV[$i] eq '-use') {
            $use = $ARGV[++$i];
            if ($use eq 'tss.txt') {
                $annOptions .= " -fragLength 1 -strand +";
                $fragLength = " -fragLength 1" if ($fragLength eq '');
            }
        } elsif ($ARGV[$i] eq '-p') {
            $givenPeakFile = $ARGV[++$i];
        } elsif ($ARGV[$i] eq '-style') {
            $style = $ARGV[++$i];
            if ($style eq 'tss') {
                $annOptions .= " -fragLength 1 -strand +";
                $fragLength = " -fragLength 1" if ($fragLength eq '');
            }
        } elsif ($ARGV[$i] eq '-edgeR') {
            $diffAlg = $ARGV[$i];
        } elsif ($ARGV[$i] eq '-fragLength') {
            $fragLength = " -fragLength $ARGV[++$i]";
        } elsif ($ARGV[$i] eq '-DESeq2') {
            $diffAlg = $ARGV[$i];
        } elsif ($ARGV[$i] eq '-all') {
            $allFlag = 1;
        } elsif ($ARGV[$i] eq '-DESeq') {
            $diffAlg = $ARGV[$i];
        } elsif ($ARGV[$i] eq '-balanced') {
            $norm2total = "";
        } elsif ($ARGV[$i] eq '-h' || $ARGV[$i] eq '--help' || $ARGV[$i] eq '--') {
            printCMD();
        } else {
            $findPeaksOpts .= " " . $ARGV[$i];
            #print STDERR "!!! \"$ARGV[$i]\" not recognized\n";
            #printCMD();
        }
    }
    my $rand = rand();
    my %toDelete = ();
    if ($diffAlg eq '-edgeR' && $norm2total ne '') {
        print STDERR "!!! Error, -edgeR requires \"-balanced\" to work correctly!!!\n";
        exit;
    }
    $log2Thresh = log($foldThresh)/log(2);
    if (@targets < 1) {
        print STDERR "!!! Error, need at least one target directory!!!\n";
        printCMD();
    }
    if (scalar(@inputs) + scalar(@background) < 1) {
        print STDERR "\t!!! Error: program requires either input or background experiments to perform differential calculations!\n";
        exit;
    }
    my $targetDirs = '';
    my $targetStr = "";
    foreach(@targets) {
        $targetDirs .= " \"$_\"";
        $targetStr .= " target";
    }
    my $inputDirs = '';
    my $inputStr = "";
    foreach(@inputs) {
        $inputDirs .= " \"$_\"";
        $inputStr .= " input";
    }
    my $bgDirs = '';
    my $bgStr = "";
    foreach(@background) {
        $bgDirs .= " \"$_\"";
        $bgStr .= " bg";
    }
    
    if ($givenPeakFile eq '') {
        $peakFile = $rand . ".peaks";
        $toDelete{$peakFile}=1;
    } else {
        $peakFile = $givenPeakFile;
    }
    
    if ($findPeaksOpts ne '') {
        print STDERR "\tUsing the following extra parameters for findPeaks:\n\t\t$findPeaksOpts\n";
    }
    
    print STDERR "\tStep1: Defining Putative Peak Set\n";
    if ($use eq '' && $givenPeakFile eq '') {
        print STDERR "\t\tFinding peaks in merged meta-experiment from target tag directories\n";
        my $targetDir = $rand . ".targetTagDir";
        `makeTagDirectory \"$targetDir\" -d $targetDirs $fragLength`;
        $toDelete{$targetDir}=1;
        my $inputDir = $rand . ".inputTagDir";
        my $cmd = "findPeaks \"$targetDir\"";
        if ($style eq '') {
            $style = 'factor';
            print STDERR "\tUsing -style $style...\n";
        }
        $cmd .= " -style $style";
        $cmd .= $fragLength . " " . $findPeaksOpts;
        if (@inputs > 0) {
            `makeTagDirectory \"$inputDir\" -d $inputDirs $fragLength`;
            $toDelete{$inputDir}=1;
            $cmd .= " -i \"$inputDir\"";
        }
        #print STDERR "`$cmd > \"$peakFile\"`\n";
        `$cmd > \"$peakFile\"`;
    } elsif ($givenPeakFile eq '' && $use ne '') {
        my $files = "";
        my @allDirs = ();
        push(@allDirs, @targets, @inputs, @background);
        print STDERR "\t\tUsing existing peak files for features:\n";
        foreach(@allDirs) {
            my $p = $_ . "/" . $use;
            if (-e $p) {
                print STDERR "\t\t\t$p\n";
                $files .= " \"$p\"";
            }
        }
        #MODIFIED
        #`mergePeaks $files > \"$peakFile\"`;
        #print "$files\n";
        #print "\"$peakFile\"\n";
        `mergePeaks $files -d 1000 >  mergePeaks_res.txt`;
        `python3 update_header.py`;
        `cat $files > merged_peaks.txt`;
        ## Split the list of files into an array
        #my @files = split / /, $files;
        # Open the output file
        #open(my $out, '>', 'merged_peaks.txt') or die "Cannot open merged_peaks.txt: $!";
        #foreach my $file (@files) {
        #   print("xxxxx$file\n");
        #   open(my $in, '<', $file) or die "Cannot open $file: $!";
        #   while (<$in>) {
        #       s/\t+$//; # removes trailing tabs
        #       print $out "$_\n";
        #   }
        #   close $in;
        #}
        #close $out;
        system("awk '{print \$2 \"\\t\" \$3 \"\\t\" \$4 \"\\t\" \$1}' merged_peaks.txt | sort -k1,1 -k2,2n | bedtools merge -d 1000 > bedtools_res.txt");
        `python3 adjust_mergePeaks_res.py`;
        `mv filtered_mergePeaks_res.txt \"$peakFile\"`;
    }
    
    $rawFile= $rand . ".raw.txt";
    $diffFile= $rand . ".diff.txt";
    $upFile = $rand . ".Up_target_vs_bg.txt";
    $downFile = $rand . ".Down_target_vs_bg.txt";
    if ($bgDirs eq '') {
        $bgDirs = $inputDirs;
        $bgStr = $inputStr;
        @background = @inputs;
        $upFile = $rand . ".Up_target_vs_input.txt";
        $downFile = $rand . ".Down_target_vs_input.txt";
    }
    
    print STDERR "\n\tStep2: Quantifying reads across target/background/input tag directories\n\n";
    #print STDERR "`annotatePeaks.pl \"$peakFile\" none -d $bgDirs $targetDirs -raw > \"$rawFile\"`;\n";
    `annotatePeaks.pl \"$peakFile\" $genome -d $bgDirs $targetDirs -raw $annOptions $fragLength > \"$rawFile\"`;
    #print STDERR "`getDiffExpression.pl \"$rawFile\" $bgStr $targetStr $norm2total $diffAlg -fdr $fdrThresh -log2fold $log2Thresh -export $rand > $diffFile`;\n";
    #
    
    print STDERR "\n\tStep3: Calling R for differential enrichment statistics ($diffAlg)\n\n";
    `getDiffExpression.pl \"$rawFile\" $bgStr $targetStr $norm2total $diffAlg -fdr $fdrThresh -log2fold $log2Thresh -export $rand > $diffFile`;
    $toDelete{$rawFile}=1;
    $toDelete{$diffFile}=1;
    $toDelete{$upFile}=1;
    $toDelete{$downFile}=1;
    print "#cmd=$ogCmd|";
    my $ofile = $upFile;
    $ofile = $diffFile if ($allFlag);
    open IN, $ofile;
    while (<IN>) {
        print $_;
    }
    close IN;
    
    foreach(keys %toDelete) {
        next if ($_ eq '/');
        #print STDERR "\trm -r \"$_\"\n";
        `rm -r "$_"`;
    }
    
  2. update_header.py

    import os
    
    # File path
    file_path = 'mergePeaks_res.txt'
    
    # Read in the file
    with open(file_path, 'r') as file:
        lines = file.readlines()
    
    # Split the first line into components
    components = lines[0].split("\t")
    
    # Change the first component to "name"
    components[0] = "name"
    
    # Join the components back into a single string
    lines[0] = "\t".join(components)
    
    # Write the file back out
    with open(file_path, 'w') as file:
        file.writelines(lines)
    
  3. adjust_mergePeaks_res.py

    import pandas as pd
    
    # Read the mergePeaks result file and bedtools result file
    mergePeaks_df = pd.read_csv('mergePeaks_res.txt', sep='\t', header=0)
    bedtools_df = pd.read_csv('bedtools_res.txt', sep='\t', header=None, names=['chr', 'start', 'end'])
    
    # Function to check if a peak is within bedtools ranges
    # def is_in_bedtools_range(row):
    filtered_mergePeaks_df = []
    for _, row in mergePeaks_df.iterrows():
        for _, bed_row in bedtools_df.iterrows():
            if row['chr'] == bed_row['chr'] and row['start'] >= bed_row['start'] and row['end'] <= bed_row['end']:
                row['start'] = bed_row['start']
                row['end'] = bed_row['end']
                filtered_mergePeaks_df.append(row)
    
    # Convert the filtered results to a DataFrame
    filtered_mergePeaks_df = pd.DataFrame(filtered_mergePeaks_df)
    
    ## Write the filtered results to a file
    #filtered_mergePeaks_df.to_csv('filtered_mergePeaks_res.txt', sep='\t', index=False)
    
    ## Filter the mergePeaks rows
    #filtered_mergePeaks_df = mergePeaks_df[mergePeaks_df.apply(is_in_bedtools_range, axis=1)]
    
    ## Sort and drop duplicates
    sorted_df = filtered_mergePeaks_df.sort_values(by=['chr', 'start', 'end'])
    deduplicated_df = sorted_df.drop_duplicates(subset=['chr', 'start', 'end'])
    
    ## Save to file
    deduplicated_df.to_csv('filtered_mergePeaks_res.txt', sep='\t', index=False)
    

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum