ChIP-seq using HOMER (-stype histone, macs2 or SICER + custom getDifferentialPeaksReplicates_broad[narrow].pl)

gene_x 0 like s 536 view s

Tags: pipeline, ChIP-seq

  1. nextflow processing data

    (chipseq) jhuang@hamm:/mnt/h1/jhuang/DATA/Data_Denise_LT_DNA_Binding/Histone-ChIP_hg38/H3K27ac_H3K4me1_public$ nextflow run NGI-ChIPseq/main.nf --reads '/mnt/h1/jhuang/DATA/Data_Denise_LT_DNA_Binding/Histone-ChIP_hg38/H3K27ac_H3K4me1_public/Raw_Data/*.fastq.gz' --genome hg38 --macsconfig macs.config --saveReference --saveAlignedIntermediates --singleEnd --blacklist_filtering -profile standard --project NHDF_enhancer_analysis_hg38 -resume
    
    ./picard/V_8_3_1_p600_601_d12_D1_input.dedup.sorted.bam
    ./picard/V_8_3_2_p600_601_d9_D2_input.dedup.sorted.bam
    ./picard/V_8_4_2_p602_d8_D1_input.dedup.sorted.bam
    ./picard/V_8_4_1_p602_d8_D2_input.dedup.sorted.bam
    
    ./picard/V_8_3_1_p600_601_d12_D1_H3K4me3.dedup.sorted.bam
    ./picard/V_8_3_2_p600_601_d9_D2_H3K4me3.dedup.sorted.bam
    ./picard/V_8_4_2_p602_d8_D1_H3K4me3.dedup.sorted.bam
    ./picard/V_8_4_1_p602_d8_D2_H3K4me3.dedup.sorted.bam
    
    ./picard/V_8_3_1_p600_601_d12_D1_H3K27me3.dedup.sorted.bam
    ./picard/V_8_3_2_p600_601_d9_D2_H3K27me3.dedup.sorted.bam
    ./picard/V_8_4_2_p602_d8_D1_H3K27me3.dedup.sorted.bam
    ./picard/V_8_4_1_p602_d8_D2_H3K27me3.dedup.sorted.bam
    
  2. make homer directories and findPeaks with HOMER under (conda homer, but note that homer is installed under Tools manually!)

    #HOMER will be installed in the same directory that you place the configureHomer.pl program.  configureHomer.pl will attempt to check for required utilities and alert you to missing programs.
    1. mkdir ~/Tools/homer; cd ~/Tools/homer
    2. wget http://homer.ucsd.edu/homer/configureHomer.pl
    3. Run the configureHomer.pl script to install homer: perl configureHomer.pl -install
    4. Add the homer/bin directory to your executable path.  For example, edit your ~/.bashrc file to include the line:
    PATH=$PATH:/home/jhuang/Tools/homer/bin/
    5. Reset your terminal so that the changes to the PATH variable take effect: source ~/.bashrc
    
    #jhuang@hamm:~/Tools/homer/bin$ wget http://homer.ucsd.edu/homer/configureHomer.pl
    #configureHomer.pl -install
    #chmod +x /home/jhuang/miniconda3/envs/homer/bin/configureHomer.pl
    #ls ~/miniconda3/envs/homer/bin
    #echo $PATH
    #If /home/jhuang/miniconda3/envs/homer/bin is not in the output, you'll need to add it to your $PATH.
    
    chmod +x ~/Tools/homer/configureHomer.pl
    ./configureHomer.pl -install hg38
    ./configureHomer.pl -list
    
    #conda update -n base -c defaults conda
    conda create -n homer
    conda activate homer
    #(NOT_USE!) conda install -c bioconda homer
    mamba install r-essentials bioconductor-deseq2 
    mamba install samtools
    #bioconductor-edger wget 
    #http://homer.ucsd.edu/homer/introduction/update.html
    #http://homer.ucsd.edu/homer/introduction/install.html
    conda install -c bioconda ucsc-bedgraphtobigwig # install bedGraphToBigWig on @homer
    conda install -c bioconda macs2
    conda install -c anaconda pandas
    
    #under (myperl) on computer @hamburg
    mkdir homer bigWigs macs2 sicer
    cd homer
    
    #Why do I need give "-genome hg38" in makeTagDirectory? If you don't provide a genome with the -genome option, HOMER will only count the number of tags in each region without any genomic context or sequence information. #So, it is essential to include this information when creating a tag directory if you plan to perform any genome-based analysis.
    
    makeTagDirectory p600_601_d12_D1_input ../results/picard/V_8_3_1_p600_601_d12_D1_input.dedup.sorted.bam -genome hg38
    makeTagDirectory p600_601_d9_D2_input ../results/picard/V_8_3_2_p600_601_d9_D2_input.dedup.sorted.bam -genome hg38
    makeTagDirectory p602_d8_D1_input ../results/picard/V_8_4_2_p602_d8_D1_input.dedup.sorted.bam -genome hg38
    makeTagDirectory p602_d8_D2_input ../results/picard/V_8_4_1_p602_d8_D2_input.dedup.sorted.bam -genome hg38
    
    makeTagDirectory p600_601_d12_D1_H3K4me3 ../results/picard/V_8_3_1_p600_601_d12_D1_H3K4me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p600_601_d9_D2_H3K4me3 ../results/picard/V_8_3_2_p600_601_d9_D2_H3K4me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p602_d8_D1_H3K4me3 ../results/picard/V_8_4_2_p602_d8_D1_H3K4me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p602_d8_D2_H3K4me3 ../results/picard/V_8_4_1_p602_d8_D2_H3K4me3.dedup.sorted.bam -genome hg38
    
    makeTagDirectory p600_601_d12_D1_H3K27me3 ../results/picard/V_8_3_1_p600_601_d12_D1_H3K27me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p600_601_d9_D2_H3K27me3 ../results/picard/V_8_3_2_p600_601_d9_D2_H3K27me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p602_d8_D1_H3K27me3 ../results/picard/V_8_4_2_p602_d8_D1_H3K27me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p602_d8_D2_H3K27me3 ../results/picard/V_8_4_1_p602_d8_D2_H3K27me3.dedup.sorted.bam -genome hg38
    
    for sample in p600_601_d12_D1_input p600_601_d9_D2_input p602_d8_D1_input p602_d8_D2_input p600_601_d12_D1_H3K4me3 p600_601_d9_D2_H3K4me3 p602_d8_D1_H3K4me3 p602_d8_D2_H3K4me3 p600_601_d12_D1_H3K27me3 p600_601_d9_D2_H3K27me3 p602_d8_D1_H3K27me3 p602_d8_D2_H3K27me3; do
        makeUCSCfile ${sample} -bigWig /home/jhuang/REFs/hg38.chromSizes -o auto -style chipseq    #-pseudo 1 -norm 1e7 -normLength 100 -fsize 1
        mv ${sample}/${sample}.ucsc.bigWig ../bigWigs/
    done
    #Note that normLength should possibly read the fragmentation length from phantompeakqualtools. Easier to set all libraries to the default value: 100.
    #To normalize to a fragment length of 150, you would use -normLength 150.
    #To turn off normalization, you would use -normLength 0.
    
    # -- not necessary any more: using MACS2 and SICER instead of using findPeaks ------------------------> go STEPs 4.1 and 5.1.
    # #factor (transcription factor ChIP-Seq, uses -center, output: peaks.txt,  default)
    # #histone (histone modification ChIP-Seq, region based, uses -region -size 500 -L 0, regions.txt)
    # for sample in p601_d8_D1 p601_d8_D2 p604_d8_D1 p604_d8_D2; do
    #   #Finding peaks of size 1000, no closer than 2000
    #   findPeaks ${sample}_H3K4me3 -style factor -size 1000  -o auto -i ${sample}_input
    #   #-minDist <#> (minimum distance between peaks, default: peak size x2)
    #   #findPeaks ${sample}_H3K27me3 -style histone -region -size 3000 -minDist 5000 -o auto -i ${sample}_input
    #   #findPeaks ${sample}_H3K27ac -style factor -size 200 -minDist 200 -o auto -i ${sample}_input
    #   #findPeaks ${sample}_H3K4me1 -style histone -region -size 1000 -minDist 2500 -o auto -i ${sample}_input
    # done
    
    ./p601_d8_D1_H3K4me3/peaks.txt
    ./p601_d8_D2_H3K4me3/peaks.txt
    ./p604_d8_D1_H3K4me3/peaks.txt
    ./p604_d8_D2_H3K4me3/peaks.txt
    
    ./p601_d8_D1_H3K27me3/regions.txt
    ./p601_d8_D2_H3K27me3/regions.txt
    ./p604_d8_D1_H3K27me3/regions.txt
    ./p604_d8_D2_H3K27me3/regions.txt
    
    for dir in p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 p604_d8_D1_H3K4me3 p604_d8_D2_H3K4me3; do
    awk -v OFS='\t' '{print $2, $3, $4, $1, $6}' ./${dir}/peaks.txt > ${dir}_peaks.bed
    grep -v "#" ${dir}_peaks.bed | sort -k1,1 -k2,2n > ${dir}_sorted_peaks.bed
    done
    
    for dir in p601_d8_D1_H3K27me3 p601_d8_D2_H3K27me3 p604_d8_D1_H3K27me3 p604_d8_D2_H3K27me3; do
    awk -v OFS='\t' '{print $2, $3, $4, $1, $6}' ./${dir}/regions.txt > ${dir}_regions.bed
    grep -v "#" ${dir}_regions.bed | sort -k1,1 -k2,2n > ${dir}_sorted_regions.bed
    done
    
    #DEBUG: why the bam files so small?
    makeTagDirectory NHDF-Ad_Control_r1 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_Control_r1.dedup.sorted.bam -genome hg38
    makeTagDirectory NHDF-Ad_Control_r2 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_Control_r2.dedup.sorted.bam -genome hg38
    makeTagDirectory NHDF-Ad_H3K27ac_r1 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_H3K27ac_r1.dedup.sorted.bam -genome hg38
    makeTagDirectory NHDF-Ad_H3K27ac_r2 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_H3K27ac_r2.dedup.sorted.bam -genome hg38
    makeTagDirectory NHDF-Ad_H3K4me1_r1 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_H3K4me1_r1.dedup.sorted.bam -genome hg38
    makeTagDirectory NHDF-Ad_H3K4me1_r2 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_H3K4me1_r2.dedup.sorted.bam -genome hg38
    
    NHDF-Ad_Control_r1 NHDF-Ad_Control_r2 NHDF-Ad_H3K27ac_r1 NHDF-Ad_H3K27ac_r2 NHDF-Ad_H3K4me1_r1 NHDF-Ad_H3K4me1_r2
    
    > (myperl) environments for HOMER, ~/Tools/diffreps/bin/diffReps.pl, MACS2, ~/Tools/SICER1.1/SICER/SICER.sh
    

2.5. diffbind (NOT_USED) https://hbctraining.github.io/Intro-to-ChIPseq/lessons/08_diffbind_differential_peaks.html #-->identifying statistically significantly differentially bound sites https://genomebiology.biomedcentral.com/articles/10.1186/s13059-022-02686-y

 #samplesheet.csv
 #SampleID, Condition, Peaks, Counts
 #Sample1, Control, sample1_peaks.bed, sample1.bam
 #Sample2, Treatment, sample2_peaks.bed, sample2.bam
 library(DiffBind)
 samples <- dba(sampleSheet="samplesheet.csv")

3.0. combine the diffReps.pl and HOMER annotatePeaks.pl, so we can use for hg38 and mm10 and so on (IMPORTANT, DON'T NEED --gname hg38, run diffReps.pl --> bed_file --> annotatePeaks.pl it with HOMER)

    #Dynamic regions were defined as MACS (H3K4me3, H3K27ac) or SICER (H3K4me1, H3K27me3) peaks overlapping significantly (≥ 2-fold change, adjusted P-value ≤ 0.05) up- or down-regulated differentially enriched regions from diffReps in the three pairwise comparisons WAC vs mock, WA314 vs mock and WAC vs WA314.

    #STEP1
    #--> not given "--gname hg38"
    ## Step4: Annotate differential sites.
    #unless($noanno or $gname eq ''){
    #        `region_analysis.pl -i $report -r -d refseq -g $gname`;
    #}
    ## Step5: Look for hotspots.
    #unless($nohs){
    #        my $hotspot = $report . '.hotspot';
    #        `findHotspots.pl -d $report -o $hotspot`;
    #}
    ~/Tools/diffreps/bin/diffReps.pl -tr ../results/picard/V_8_1_6_p601_d8_D1_H3K4me3.dedup.sorted.bed ../results/picard/V_8_1_5_p601_d8_D2_H3K4me3.dedup.sorted.bed -co ../results/picard/V_8_1_6_p601_d8_D1_input.dedup.sorted.bed ../results/picard/V_8_1_5_p601_d8_D2_input.dedup.sorted.bed  --report output_results  --chrlen /home/jhuang/REFs/hg38.chromSizes --nsd sharp --noanno

    ~/Tools/diffreps/bin/diffReps.pl -tr ../results/picard/V_8_1_6_p601_d8_D1_H3K4me3.dedup.sorted.bed ../results/picard/V_8_1_5_p601_d8_D2_H3K4me3.dedup.sorted.bed -co ../results/picard/V_8_1_6_p601_d8_D1_input.dedup.sorted.bed ../results/picard/V_8_1_5_p601_d8_D2_input.dedup.sorted.bed  --report p602_vs_p600_601_H3K4me3_diff_out  --chrlen /home/jhuang/REFs/hg38.chromSizes --nsd sharp --noanno
    ~/Tools/diffreps/bin/diffReps.pl --treatment ./results/picard/V_8_4_2_p602_d8_D1_H3K27me3.dedup.sorted.bed ./results/picard/V_8_4_1_p602_d8_D2_H3K27me3.dedup.sorted.bed --btr ./results/picard/V_8_4_2_p602_d8_D1_input.dedup.sorted.bed ./results/picard/V_8_4_1_p602_d8_D2_input.dedup.sorted.bed --control ./results/picard/V_8_3_1_p600_601_d12_D1_H3K27me3.dedup.sorted.bed ./results/picard/V_8_3_2_p600_601_d9_D2_H3K27me3.dedup.sorted.bed --bco ./results/picard/V_8_3_1_p600_601_d12_D1_input.dedup.sorted.bed ./results/picard/V_8_3_2_p600_601_d9_D2_input.dedup.sorted.bed  --report p602_vs_p600_601_H3K27me3_diff_out  --chrlen /home/jhuang/REFs/hg38.chromSizes --nsd broad --mode block --window 10000 --gap 30000 --noanno

    #I have utilized 'diffReps' for this analysis because the methods I previously mentioned in my last email were not effective in identifying sites with statistically significant differential binding. 'diffReps' appears to be a more suitable tool for our needs. Below is the command I used. It's important to note that 'diffReps' typically requires replicates as input and outputs a set of peaks/regions.

    #STEP2
    #replace Chr to '#Chr'
    grep -v "#" output_results | sort -k1,1 -k2,2n > output_results_
    awk 'BEGIN {OFS="\t"} {print $1, $2, $3, "diffreps_peak_"NR, $12}' output_results_ > H3K4me3.bed
    #grep -v "#" H3K4me3.bed | sort -k1,1 -k2,2n > H3K4me3_sorted_peaks.bed

    grep -v "#" p602_vs_p600_601_H3K4me3_diff_out | sort -k1,1 -k2,2n > p602_vs_p600_601_H3K4me3_diff_out_
    awk 'BEGIN {OFS="\t"} {print $1, $2, $3, "diffreps_peak_"NR, $12}' p602_vs_p600_601_H3K4me3_diff_out_ > p602_vs_p600_601_H3K4me3.bed
    grep -v "#" p602_vs_p600_601_H3K27me3_diff_out | sort -k1,1 -k2,2n > p602_vs_p600_601_H3K27me3_diff_out_
    awk 'BEGIN {OFS="\t"} {print $1, $2, $3, "diffreps_peak_"NR, $12}' p602_vs_p600_601_H3K27me3_diff_out_ > p602_vs_p600_601_H3K27me3.bed

    #STEP3 (under myperl) peak calling macs2 for narrow peaks, CISER for broad peaks!
    #process the output of diffReps.pl to BED file.
    annotatePeaks.pl H3K4me3.bed hg38 > H3K4me3_annotated_peaks.txt

    annotatePeaks.pl p602_vs_p600_601_H3K4me3.bed hg38 > p602_vs_p600_601_H3K4me3_annotated_peaks.txt
    annotatePeaks.pl p602_vs_p600_601_H3K27me3.bed hg38 > p602_vs_p600_601_H3K27me3_annotated_peaks.txt
    (head -n 1 p602_vs_p600_601_H3K4me3_annotated_peaks.txt && tail -n +2 p602_vs_p600_601_H3K4me3_annotated_peaks.txt | sort -t "_" -k 3 -n) > p602_vs_p600_601_H3K4me3_annotated_peaks_.txt
    (head -n 1 p602_vs_p600_601_H3K27me3_annotated_peaks.txt && tail -n +2 p602_vs_p600_601_H3K27me3_annotated_peaks.txt | sort -t "_" -k 3 -n) > p602_vs_p600_601_H3K27me3_annotated_peaks_.txt
    ~/Tools/csv2xls-0.4/csv_to_xls.py p602_vs_p600_601_H3K4me3_annotated_peaks_.txt -d$'\t' -o p602_vs_p600_601_H3K4me3_annotated_peaks.xls
    ~/Tools/csv2xls-0.4/csv_to_xls.py p602_vs_p600_601_H3K27me3_annotated_peaks_.txt -d$'\t' -o p602_vs_p600_601_H3K27me3_annotated_peaks.xls

4.0. combine macs2 to getDifferentialPeaksReplicates.pl

    replace the initial peak identification by using your MACS2 output.

    #http://homer.ucsd.edu/homer/ngs/diffExpression.html
    #getDifferentialPeaksReplicates.pl = findPeaks + annotatePeaks.pl + getDiffExpression.pl 
    #annotatePeaks.pl tss hg38 -raw -d H3K4me3-Mock-rep1/ H3K4me3-Mock-rep2/ H3K4me3-WNT-rep1/ H3K4me3-WNT-rep3/ > countTable.peaks.txt

    Here's an outline of how we might be able to replace the initial peak identification by using your MACS2 output.

    #TODO: using MACS call peaks of the data H3K27ac.

4.1. MACS2 peak calling

  # -- macs2 --> bed --> annotatePeaks.pl
  conda activate homer
  cd macs2
  macs2 callpeak -t ../results/picard/V_8_3_1_p600_601_d12_D1_H3K4me3.dedup.sorted.bam -c ../results/picard/V_8_3_1_p600_601_d12_D1_input.dedup.sorted.bam -f BAM -g hs -n p600_601_d12_D1 -q 0.05
  macs2 callpeak -t ../results/picard/V_8_3_2_p600_601_d9_D2_H3K4me3.dedup.sorted.bam -c ../results/picard/V_8_3_2_p600_601_d9_D2_input.dedup.sorted.bam -f BAM -g hs -n p600_601_d9_D2 -q 0.05
  macs2 callpeak -t ../results/picard/V_8_4_2_p602_d8_D1_H3K4me3.dedup.sorted.bam -c ../results/picard/V_8_4_2_p602_d8_D1_input.dedup.sorted.bam -f BAM -g hs -n p602_d8_D1 -q 0.05
  macs2 callpeak -t ../results/picard/V_8_4_1_p602_d8_D2_H3K4me3.dedup.sorted.bam -c ../results/picard/V_8_4_1_p602_d8_D2_input.dedup.sorted.bam -f BAM -g hs -n p602_d8_D2 -q 0.05

  awk 'OFS="\t" {print $1, $2, $3, $4, $5}' p600_601_d12_D1_peaks.narrowPeak > p600_601_d12_D1_peaks.bed
  awk 'OFS="\t" {print $1, $2, $3, $4, $5}' p600_601_d9_D2_peaks.narrowPeak > p600_601_d9_D2_peaks.bed
  awk 'OFS="\t" {print $1, $2, $3, $4, $5}' p602_d8_D1_peaks.narrowPeak > p602_d8_D1_peaks.bed
  awk 'OFS="\t" {print $1, $2, $3, $4, $5}' p602_d8_D2_peaks.narrowPeak > p602_d8_D2_peaks.bed

  #annotatePeaks.pl p601_d8_D1_peaks.bed hg38 > p601_d8_D1_annotated_peaks.txt
  #annotatePeaks.pl p601_d8_D2_peaks.bed hg38 > p601_d8_D2_annotated_peaks.txt
  #annotatePeaks.pl p604_d8_D1_peaks.bed hg38 > p604_d8_D1_annotated_peaks.txt
  #annotatePeaks.pl p604_d8_D2_peaks.bed hg38 > p604_d8_D2_annotated_peaks.txt

4.2. Convert your MACS2 peaks to HOMER-compatible format. You can do this manually or with a script. For example:

It's possible to use more information from the MACS2 output file to create a more informative peaks.txt file for HOMER. However, it's important to note that some information that HOMER needs for its differential peak analysis is not available in the MACS2 output (such as Normalized Tag Count, Control Tags, and others). But we can certainly map more of the available MACS2 columns to the corresponding HOMER columns.

  #The following awk command can be used to convert more MACS2 information into the HOMER format:

  cd macs2
  #awk 'BEGIN{OFS="\t"}{print $1,$2,$3,"Peak_"NR,$5,$6,$7,$8,$9,$10}' macs2_peaks.bed > macs2_peaks.txt
  awk 'BEGIN{OFS="\t"} NR > 1 {print $10, $1, $2, $3, "+", "0", "0", $6, $6, "0", $8, $5, $9, "0", "0"}' p600_601_d12_D1_peaks.xls > p600_601_d12_D1_macs2_peaks.txt
  awk 'BEGIN{OFS="\t"} NR > 1 {print $10, $1, $2, $3, "+", "0", "0", $6, $6, "0", $8, $5, $9, "0", "0"}' p600_601_d9_D2_peaks.xls > p600_601_d9_D2_macs2_peaks.txt
  awk 'BEGIN{OFS="\t"} NR > 1 {print $10, $1, $2, $3, "+", "0", "0", $6, $6, "0", $8, $5, $9, "0", "0"}' p602_d8_D1_peaks.xls > p602_d8_D1_macs2_peaks.txt
  awk 'BEGIN{OFS="\t"} NR > 1 {print $10, $1, $2, $3, "+", "0", "0", $6, $6, "0", $8, $5, $9, "0", "0"}' p602_d8_D2_peaks.xls > p602_d8_D2_macs2_peaks.txt

  This command will:

      * Skip the header line (NR > 1)
      * Map the MACS2 peak name ($10) to the HOMER PeakID
      * Map the MACS2 chromosome, start, and end ($1, $2, $3) to the HOMER chr, start, end
      * Use a placeholder "+" for the HOMER strand
      * Use a placeholder "0" for the HOMER Normalized Tag Count and Focus Ratio
      * Map the MACS2 pileup ($6) to the HOMER findPeaks Score and Total Tags
      * Use a placeholder "0" for the HOMER Control Tags
      * Map the MACS2 fold_enrichment ($8) to the HOMER Fold Change vs Control
      * Map the MACS2 abs_summit ($5) to the HOMER p-value vs Control
      * Map the MACS2 -log10(qvalue) ($9) to the HOMER Fold Change vs Local
      * Use a placeholder "0" for the HOMER p-value vs Local and Clonal Fold Change

  This script is limited by the differences in the information provided by MACS2 and HOMER. While it makes use of as much information as possible from the MACS2 output, some columns in the HOMER format still have to be filled with placeholder values.

4.3. Associate the converted peak files with their respective tag directories. In HOMER, peak files can be associated with a tag directory by placing them in the tag directory with the filename "peaks.txt".

  #mv homer/p600_601_d12_D1_H3K4me3/peaks.txt homer/p600_601_d12_D1_H3K4me3/peaks_raw.txt
  #mv homer/p600_601_d9_D2_H3K4me3/peaks.txt homer/p600_601_d9_D2_H3K4me3/peaks_raw.txt
  #mv homer/p602_d8_D1_H3K4me3/peaks.txt homer/p602_d8_D1_H3K4me3/peaks_raw.txt
  #mv homer/p602_d8_D2_H3K4me3/peaks.txt homer/p602_d8_D2_H3K4me3/peaks_raw.txt
  cp macs2/p600_601_d12_D1_macs2_peaks.txt homer/p600_601_d12_D1_H3K4me3/peaks.txt  
  cp macs2/p600_601_d9_D2_macs2_peaks.txt homer/p600_601_d9_D2_H3K4me3/peaks.txt    
  cp macs2/p602_d8_D1_macs2_peaks.txt homer/p602_d8_D1_H3K4me3/peaks.txt
  cp macs2/p602_d8_D2_macs2_peaks.txt homer/p602_d8_D2_H3K4me3/peaks.txt
  #33896 p600_601_d12_D1_peaks.bed
  #47977 p600_601_d9_D2_peaks.bed
  #59049 p602_d8_D1_peaks.bed
  #33730 p602_d8_D2_peaks.bed
  #CHECK and DELETE headers including "name ..." in the new peaks.txt files!

  #Repeat this for each of your tag directories.

4.4. The program getDifferentialPeaksReplicates will essentially perform 3 steps, in the step 2 was modified.

First, it will pool the target tag directories and input directories separately into pooled experiments and perform an initial peak identification (using findPeaks). Pooling the experiments is generally more sensitive than trying to merge the individual peak files coming from each experiment (although this can be done using the "-use " option if each directory already has a peak file associated with it). Next, it will quantify the reads at the initial putative peaks across each of the target and input tag directories using annotatePeaks.pl. Finally, it calls getDiffExpression.pl and ultimately passes these values to the R/Bioconductor package DESeq2 to calculate enrichment values for each peak, returning only those peaks that pass a given fold enrichment (default: 2-fold) and FDR cutoff (default 5%). We can run getDifferentialPeaksReplicates.pl with the -use option to specify that the provided peaks should be used instead of calling findPeaks:

        #-- Successful modification of the script getDifferentialPeaksReplicates.pl (Explain how to archieve getDifferentialPeaksReplicates.pl, only for DEBUG, for calculation directly run getDifferentialPeaksReplicates.pl!) --
        #The -d parameter in the mergePeaks function in HOMER is used to specify the maximum distance between peak centers
        #change Max distance to merge to 30000 bp in getDifferentialPeaksReplicates.pl
        #mergePeaks -d 500 [100,1000] temp_sorted | sort

        #conda list homer  #4.11
        mergePeaks -d 2000 p600_601_d12_D1_H3K4me3/peaks.txt p600_601_d9_D2_H3K4me3/peaks.txt > mergePeaks_res1.txt  # get 37569 records 
        mergePeaks -d 2000 p602_d8_D1_H3K4me3/peaks.txt p602_d8_D2_H3K4me3/peaks.txt > mergePeaks_res2.txt  # get 44076 records

  #(homer) jhuang@hamburg:~/DATA/Data_Denise_LT_DNA_Bindung/results_chipseq_histone_hg38/H3K4me3_H3K27ac__H3K27me3_H3K9me3/homer$
  #getDifferentialPeaksReplicates.pl -use <tag_directory1>/peaks.txt,<tag_directory2>/peaks.txt ...
  #NOTE that the H3K4me3 has zu viele peaks, we can use the slightly modified getDifferentialPeaksReplicates_narrow.pl, "-d 2000"
  ./getDifferentialPeaksReplicates_narrow.pl -t p600_601_d12_D1_H3K4me3 p600_601_d9_D2_H3K4me3 -i p600_601_d12_D1_input p600_601_d9_D2_input -genome hg38 -use peaks.txt -style histone > p600_601_d9d12_H3K4me3_macs2_peaks.txt
  #p600_601_d12_D1_H3K4me3/peaks.txt    p600_601_d9_D2_H3K4me3/peaks.txt    Total   Name
  # X   11530   p600_601_d9_D2_H3K4me3/peaks.txt
  #X        2158    p600_601_d12_D1_H3K4me3/peaks.txt
  #X    X   23881   p600_601_d12_D1_H3K4me3/peaks.txt|p600_601_d9_D2_H3K4me3/peaks.txt
  # Using DESeq2 to calculate differential expression/enrichment...
  # Output Stats input vs. target:
  #     Total Genes: 37569
  #     Total Up-regulated in target vs. input: 26340 (70.111%) [log2fold>1, FDR<0.05]
  #     Total Dn-regulated in target vs. input: 1 (0.003%) [log2fold<-1, FDR<0.05]

  ./getDifferentialPeaksReplicates_narrow.pl -t p602_d8_D1_H3K4me3 p602_d8_D2_H3K4me3 -i p602_d8_D1_input p602_d8_D2_input -genome hg38 -use peaks.txt -style histone > p602_d8_H3K4me3_macs2_peaks.txt
  #p602_d8_D1_H3K4me3/peaks.txt p602_d8_D2_H3K4me3/peaks.txt    Total   Name
  # X   270 p602_d8_D2_H3K4me3/peaks.txt
  #X        24939   p602_d8_D1_H3K4me3/peaks.txt
  #X    X   18867   p602_d8_D1_H3K4me3/peaks.txt|p602_d8_D2_H3K4me3/peaks.txt
  # Using DESeq2 to calculate differential expression/enrichment...
  # Output Stats input vs. target:
  #     Total Genes: 44076
  #     Total Up-regulated in target vs. input: 24203 (54.912%) [log2fold>1, FDR<0.05]
  #     Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]

  #"-d 1000"
  #p600_601_d12_D1_H3K4me3/peaks.txt    p600_601_d9_D2_H3K4me3/peaks.txt    Total   Name
  # X   14165   p600_601_d9_D2_H3K4me3/peaks.txt
  #X        3910    p600_601_d12_D1_H3K4me3/peaks.txt
  #X    X   26162   p600_601_d12_D1_H3K4me3/peaks.txt|p600_601_d9_D2_H3K4me3/peaks.txt
  #Using DESeq2 to calculate differential expression/enrichment...
  # Output Stats input vs. target:
  #     Total Genes: 44237
  #     Total Up-regulated in target vs. input: 32706 (73.934%) [log2fold>1, FDR<0.05]
  #     Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]

  #p602_d8_D1_H3K4me3/peaks.txt p602_d8_D2_H3K4me3/peaks.txt    Total   Name
  #    X    2825    p602_d8_D2_H3K4me3/peaks.txt
  #X        29258   p602_d8_D1_H3K4me3/peaks.txt
  #X    X   20829   p602_d8_D1_H3K4me3/peaks.txt|p602_d8_D2_H3K4me3/peaks.txt
  #Using DESeq2 to calculate differential expression/enrichment...
  #Output Stats input vs. target:
  #    Total Genes: 52912
  #    Total Up-regulated in target vs. input: 32484 (61.393%) [log2fold>1, FDR<0.05]
  #    Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]

4.5. Draw plots

  #grep "p600_601_d12_D1_H3K4me3/peaks.txt|p600_601_d9_D2_H3K4me3/peaks.txt" p600_601_d9d12_H3K4me3_macs2_peaks.txt | wc -l
  #grep -v "p600_601_d12_D1_H3K4me3/peaks.txt|p600_601_d9_D2_H3K4me3/peaks.txt" p600_601_d9d12_H3K4me3_macs2_peaks.txt | grep "p600_601_d12_D1_H3K4me3/peaks.txt" | wc -l
  #grep -v "p600_601_d12_D1_H3K4me3/peaks.txt|p600_601_d9_D2_H3K4me3/peaks.txt" p600_601_d9d12_H3K4me3_macs2_peaks.txt | grep "p600_601_d9_D2_H3K4me3/peaks.txt" | wc -l

  #grep "p602_d8_D1_H3K4me3/peaks.txt|p602_d8_D2_H3K4me3/peaks.txt" p602_d8_H3K4me3_macs2_peaks.txt | wc -l
  #grep -v "p602_d8_D1_H3K4me3/peaks.txt|p602_d8_D2_H3K4me3/peaks.txt" p602_d8_H3K4me3_macs2_peaks.txt | grep "p602_d8_D1_H3K4me3/peaks.txt" | wc -l
  #grep -v "p602_d8_D1_H3K4me3/peaks.txt|p602_d8_D2_H3K4me3/peaks.txt" p602_d8_H3K4me3_macs2_peaks.txt | grep "p602_d8_D2_H3K4me3/peaks.txt" | wc -l

  import matplotlib.pyplot as plt
  from matplotlib_venn import venn2
  venn2(subsets=(1808, 5774, 25124), set_labels=('Donor 1', 'Donor 2'))
  plt.title('Peaks between p600+p601 d9 or d12 H3K4me3 Donors')
  plt.xlabel('Number of Elements')
  plt.ylabel('')
  plt.savefig('Peak_Consistency_Between_p600+p601_d9d12_H3K4me3_Donors.png', dpi=300, bbox_inches='tight')

  import matplotlib.pyplot as plt
  from matplotlib_venn import venn2
  venn2(subsets=(9100, 2761, 20623), set_labels=('Donor 1', 'Donor 2'))
  plt.title('Peaks between p602 d8 H3K4me3 Donors')
  plt.xlabel('Number of Elements')
  plt.ylabel('')
  plt.savefig('Peak_Consistency_Between_p602_d8_H3K4me3_Donors.png', dpi=300, bbox_inches='tight')

  awk 'NR>1 {print $2 "\t" $3 "\t" $4 "\t" $1}' p600_601_d9d12_H3K4me3_macs2_peaks.txt > p600+p601_d9d12_H3K4me3_macs2_peaks.bed
  awk 'NR>1 {print $2 "\t" $3 "\t" $4 "\t" $1}' p602_d8_H3K4me3_macs2_peaks.txt > p602_d8_H3K4me3_macs2_peaks.bed
  ~/Tools/csv2xls-0.4/csv_to_xls.py p600_601_d9d12_H3K4me3_macs2_peaks.txt -d$'\t' -o p600+p601_d9d12_H3K4me3_macs2_peaks.xls
  ~/Tools/csv2xls-0.4/csv_to_xls.py p602_d8_H3K4me3_macs2_peaks.txt -d$'\t' -o p602_d8_H3K4me3_macs2_peaks.xls

4.6. identifying statistically significantly differentially bound sites based on evidence of binding affinity (measured by differences in read densities)

  ##http://homer.ucsd.edu/homer/ngs/peaksReplicates.html
  #getDifferentialPeaksReplicates.pl -t Oct4-r1/ Oct4-r2/ -b Sox2-r1/ Sox2-r2/ -i Input-r1/ Input-r2/ > outputPeaks.txt
  ##first get TSS regions using annotatePeaks.pl
  #annotatePeaks.pl tss hg38 > tss.txt
  #getDifferentialPeaksReplicates.pl -t H3K27ac-WNT-r1/ H3K27ac-WNT-r2/ -b H3K27ac-notx-r1 H3K27ac-notx-r2/ -p tss.txt > outputPeaks.txt

  #- Donor I p602 (d8) vs p600+601 (d12) H3K4me3
  #- Donor II p602 (d8) vs p600+601 (d9) H3K4me3
  ./getDifferentialPeaksReplicates_narrow.pl -t p602_d8_D1_H3K4me3 p602_d8_D2_H3K4me3 -b p600_601_d12_D1_H3K4me3 p600_601_d9_D2_H3K4me3 -genome hg38 -use peaks.txt -style histone > p602_d8_vs_p600_601_d9d12_H3K4me3_diff_peaks.txt
  #./getDifferentialPeaksReplicates_narrow.pl -t p602_d8_D1_H3K4me3 -b p600_601_d12_D1_H3K4me3 -genome hg38 -use peaks.txt -style histone > p602_d8_vs_p600_601_d12_D1_H3K4me3_diff_peaks.txt

5.1. SICER peak calling (under env myperl)

  # -- SICER --> bed --> annotatePeaks.pl
  mkdir sicer; cd sicer;

  ln -s ../results/picard/V_8_3_1_p600_601_d12_D1_H3K27me3.dedup.sorted.bed .
  ln -s ../results/picard/V_8_3_2_p600_601_d9_D2_H3K27me3.dedup.sorted.bed .
  ln -s ../results/picard/V_8_4_2_p602_d8_D1_H3K27me3.dedup.sorted.bed .
  ln -s ../results/picard/V_8_4_1_p602_d8_D2_H3K27me3.dedup.sorted.bed .
  ln -s ../results/picard/V_8_3_1_p600_601_d12_D1_input.dedup.sorted.bed .
  ln -s ../results/picard/V_8_3_2_p600_601_d9_D2_input.dedup.sorted.bed .
  ln -s ../results/picard/V_8_4_2_p602_d8_D1_input.dedup.sorted.bed .
  ln -s ../results/picard/V_8_4_1_p602_d8_D2_input.dedup.sorted.bed .

  #/home/jhuang/Tools/SICER1.1/SICER/SICER.sh [InputDir] [bed file] [control file] [OutputDir] [Species] [redundancy threshold] [window size (bp)] [fragment size] [effective genome fraction] [gap size (bp)] [FDR]
  # 10000 is window size, 30000 is the gap size, 160 is the fragment size
  mkdir p600_601_d12_D1 p600_601_d9_D2 p602_d8_D1 p602_d8_D2
  ~/Tools/SICER1.1/SICER/SICER.sh . V_8_3_1_p600_601_d12_D1_H3K27me3.dedup.sorted.bed V_8_3_1_p600_601_d12_D1_input.dedup.sorted.bed p600_601_d12_D1 hg38 1 10000 160 0.74 30000 0.01;
  ~/Tools/SICER1.1/SICER/SICER.sh . V_8_3_2_p600_601_d9_D2_H3K27me3.dedup.sorted.bed V_8_3_2_p600_601_d9_D2_input.dedup.sorted.bed p600_601_d9_D2 hg38 1 10000 160 0.74 30000 0.01;
  ~/Tools/SICER1.1/SICER/SICER.sh . V_8_4_2_p602_d8_D1_H3K27me3.dedup.sorted.bed V_8_4_2_p602_d8_D1_input.dedup.sorted.bed p602_d8_D1 hg38 1 10000 160 0.74 30000 0.01;
  ~/Tools/SICER1.1/SICER/SICER.sh . V_8_4_1_p602_d8_D2_H3K27me3.dedup.sorted.bed V_8_4_1_p602_d8_D2_input.dedup.sorted.bed p602_d8_D2 hg38 1 10000 160 0.74 30000 0.01;
  #sicer -t treatment.bed -c control.bed -s hg38 -w 200 -rt 1 -f 150 -egf 0.74 -fdr 0.01 -g 600 -e 1000

  #----------- to STEP 5.2-3. ----------->
  #TODO: - check if the peak calling works well in IGV!
  # - call peaks for H3K27me1, adjust the SICER-parameters!
  # - transform them peaks.txt of HOMER, and call getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K27me3 p601_d8_D2_H3K27me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K27me3_sicer_peaks.txt
  getDifferentialPeaksReplicates.pl -t p604_d8_D1_H3K27me3 p604_d8_D2_H3K27me3 -i p604_d8_D1_input p604_d8_D2_input -genome hg38 -use peaks.txt > p604_d8_H3K27me3_sicer_peaks.txt

  #Note that histone using 'cat file1.bed file2.bed | sort -k1,1 -k2,2n | bedtools merge > merged.bed'
  #Note that factor using HOMER getDifferentialPeaksReplicates from begining: "mergePeaks.pl --> DESeq recheck"!

  mergePeaks sicer/p601_d8_D1/V_8_1_6_p601_d8_D1_H3K27me3.dedup.sorted-W10000-G30000-FDR0.01-island.bed sicer/p601_d8_D2/V_8_1_5_p601_d8_D2_H3K27me3.dedup.sorted-W10000-G30000-FDR0.01-island.bed -d 10000 > mergedPeaks1.txt

  cat sicer/p601_d8_D1/V_8_1_6_p601_d8_D1_H3K27me3.dedup.sorted-W10000-G30000-FDR0.01-island.bed sicer/p601_d8_D2/V_8_1_5_p601_d8_D2_H3K27me3.dedup.sorted-W10000-G30000-FDR0.01-island.bed | sort -k1,1 -k2,2n | bedtools merge > merged_p601_d8.bed
  #4957 (4388, 4822)

  awk 'BEGIN{OFS="\t"; print "PeakID\tchr\tstart\tend\tstrand"} {print "Peak-"NR, $1, $2, $3, "."}' merged_p601_d8.bed > homer_peaks.bed
  #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!TODO!!!!!!!!!!!!!!!!!!!!!!!!: The bedtools merge command does not generate output in the HOMER peak format directly. However, you can use a combination of Unix command line tools (e.g., awk, sort) to convert bedtools output into HOMER format.
  #bedtools merge -i input.bed > merged.bed
  #awk 'BEGIN{OFS="\t"; print "PeakID\tchr\tstart\tend\tstrand"} {print "Peak-"NR, $1, $2, $3, "."}' merged.bed > homer_peaks.bed

5.2-3. Convert your SICER peaks to HOMER-compatible format, and replace them in homer/${sample}/peaks.txt. You can do this manually or with a script.

- The following awk command can be used to convert more MACS2 information into the HOMER format:

  #mv homer/p601_d8_D1_H3K27me3/regions.txt homer/p601_d8_D1_H3K27me3/regions_raw.txt
  #mv homer/p601_d8_D2_H3K27me3/regions.txt homer/p601_d8_D2_H3K27me3/regions_raw.txt
  #mv homer/p604_d8_D1_H3K27me3/regions.txt homer/p604_d8_D1_H3K27me3/regions_raw.txt
  #mv homer/p604_d8_D2_H3K27me3/regions.txt homer/p604_d8_D2_H3K27me3/regions_raw.txt

  #NOTE that we should name the file as peaks.txt, since the Input-samples contain only peaks.txt.
  awk 'BEGIN{OFS="\t"} {print $1"-"$2"-"$3,$1,$2,$3,"+",$4,0,0,$4,$5,0,$6,0}' sicer/p600_601_d12_D1/V_8_3_1_p600_601_d12_D1_H3K27me3.dedup.sorted-W10000-G30000-islands-summary-FDR0.01 > homer/p600_601_d12_D1_H3K27me3/peaks.txt
  awk 'BEGIN{OFS="\t"} {print $1"-"$2"-"$3,$1,$2,$3,"+",$4,0,0,$4,$5,0,$6,0}' sicer/p600_601_d9_D2/V_8_3_2_p600_601_d9_D2_H3K27me3.dedup.sorted-W10000-G30000-islands-summary-FDR0.01 > homer/p600_601_d9_D2_H3K27me3/peaks.txt
  awk 'BEGIN{OFS="\t"} {print $1"-"$2"-"$3,$1,$2,$3,"+",$4,0,0,$4,$5,0,$6,0}' sicer/p602_d8_D1/V_8_4_2_p602_d8_D1_H3K27me3.dedup.sorted-W10000-G30000-islands-summary-FDR0.01 > homer/p602_d8_D1_H3K27me3/peaks.txt
  awk 'BEGIN{OFS="\t"} {print $1"-"$2"-"$3,$1,$2,$3,"+",$4,0,0,$4,$5,0,$6,0}' sicer/p602_d8_D2/V_8_4_1_p602_d8_D2_H3K27me3.dedup.sorted-W10000-G30000-islands-summary-FDR0.01 > homer/p602_d8_D2_H3K27me3/peaks.txt

  #The format is similar to the following.
  #chr1-83000-89999        chr1    83000   89999   +       21      0       0       21      27      0       5.623763510806874e-10   0       0       0
  #chr1-859000-865999      chr1    859000  865999  +       23      0       0       23      28      0       3.501056563888787e-11   0       0       0

  This command will:
      * PeakID: Format it as "chr-start-end".
      * chr: Same as SICER.
      * start: Same as SICER.
      * end: Same as SICER.
      * strand: SICER doesn't provide strand information, we could fill with a default value '+'. 
      * Normalized Tag Count: Same as the normalized read count column in SICER.
      * focus ratio: SICER doesn't provide this, we could fill with a default value '0'.
      * findPeaks Score: Corresponds to Fold Change or FDR column in SICER.
      * Total Tags: SICER doesn't provide this, we could fill with a default value '0'.
      * Control Tags (normalized to IP Experiment): Same as Clonal reads or local background column in SICER.
      * Fold Change vs Control: SICER doesn't provide this, we could fill with a default value '0'.
      * p-value vs Control: Same as the p-value column in SICER.
      * Fold Change vs Local: SICER doesn't provide this, we could fill with a default value '0'.
      * p-value vs Local: SICER doesn't provide this, we could fill with a default value '0'.
      * Clonal Fold Change: SICER doesn't provide this, we could fill with a default value '0'.

5.4. The program getDifferentialPeaksReplicates_broad.pl will essentially perform 3 steps, in the step 2 was modified.

    conda activate myperl
    cd homer
    #diff /home/jhuang/anaconda3/envs/myperl/bin/getDifferentialPeaksReplicates.pl /home/jhuang/homer/bin/getDifferentialPeaksReplicates.pl
    #vim /home/jhuang/anaconda3/envs/myperl/bin/getDifferentialPeaksReplicates.pl #max_distance_to_merge at line 203 to 133000, "-d <#>"

          #-- Successful modification of the script getDifferentialPeaksReplicates.pl (Explain how to archieve getDifferentialPeaksReplicates.pl, only for DEBUG, for calculation directly run getDifferentialPeaksReplicates.pl!) --
          #The -d parameter in the mergePeaks function in HOMER is used to specify the maximum distance between peak centers
          #change Max distance to merge to 30000 bp in getDifferentialPeaksReplicates.pl
          #mergePeaks -d 30000 [] temp_sorted | sort

          #conda list homer  #4.11
          mergePeaks p600_601_d12_D1_H3K27me3/peaks.txt p600_601_d9_D2_H3K27me3/peaks.txt > mergePeaks_res.txt

          python3 update_header.py

          cat p600_601_d12_D1_H3K4me3/peaks.txt p600_601_d9_D2_H3K4me3/peaks.txt > merged_peaks.txt
          awk '{print $2 "\t" $3 "\t" $4 "\t" $1}' merged_peaks.txt | sort -k1,1 -k2,2n | bedtools merge -d 1000 > bedtools_res.txt # get 36579 records using 'bedtools merge'
          python3 adjust_mergePeaks_res.py # limiting the start- and end-positons of mergePeaks_res.txt to the ranges in bedtools_res.txt, last very long!

          #check if the results are correct
          cut -d$'\t' -f2-4 filtered_mergePeaks_res.txt > control1
          diff control1 bedtools_res.txt
          #NOTE that the script getDifferentialPeaksReplicates_broad.pl is modified in the step2; the script used "update_header.py" and "adjust_mergedPeaks_res.py"
    #-d 35000
    ./getDifferentialPeaksReplicates_broad.pl -t p600_601_d12_D1_H3K27me3 p600_601_d9_D2_H3K27me3 -i p600_601_d12_D1_input p600_601_d9_D2_input -genome hg38 -use peaks.txt -style histone > p600_601_d9d12_H3K27me3_sicer_regions.txt
    ./getDifferentialPeaksReplicates_broad.pl -t p602_d8_D1_H3K27me3 p602_d8_D2_H3K27me3 -i p602_d8_D1_input p602_d8_D2_input -genome hg38 -use peaks.txt -style histone > p602_d8_H3K27me3_sicer_regions.txt

    #p600_601_d12_D1_H3K27me3/peaks.txt p600_601_d9_D2_H3K27me3/peaks.txt   Total   Name
    #   X   1821    p600_601_d9_D2_H3K27me3/peaks.txt
    #X      1306    p600_601_d12_D1_H3K27me3/peaks.txt
    #X  X   3981    p600_601_d12_D1_H3K27me3/peaks.txt|p600_601_d9_D2_H3K27me3/peaks.txt
    #Using DESeq2 to calculate differential expression/enrichment...
    #Output Stats input vs. target:
    #    Total Genes: 4431
    #    Total Up-regulated in target vs. input: 2502 (56.466%) [log2fold>1, FDR<0.05]
    #    Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]
    #    
    #p602_d8_D1_H3K27me3/peaks.txt  p602_d8_D2_H3K27me3/peaks.txt   Total   Name
    #   X   1101    p602_d8_D2_H3K27me3/peaks.txt
    #X      1602    p602_d8_D1_H3K27me3/peaks.txt
    #X  X   3668    p602_d8_D1_H3K27me3/peaks.txt|p602_d8_D2_H3K27me3/peaks.txt
    #Using DESeq2 to calculate differential expression/enrichment...
    #Output Stats input vs. target:
    #    Total Genes: 4216
    #    Total Up-regulated in target vs. input: 2113 (50.119%) [log2fold>1, FDR<0.05]
    #    Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]

5.5. Draw plots

    grep "p600_601_d12_D1_H3K27me3/peaks.txt|p600_601_d9_D2_H3K27me3/peaks.txt" p600_601_d9d12_H3K27me3_sicer_regions.txt | wc -l  # 2128
    grep "p600_601_d12_D1_H3K27me3/peaks.txt" p600_601_d9d12_H3K27me3_sicer_regions.txt | wc -l                                # 2162 - 2128 = 34
    grep "p600_601_d9_D2_H3K27me3/peaks.txt" p600_601_d9d12_H3K27me3_sicer_regions.txt | wc -l                                 # 2468 - 2128 = 340
    # 340+34+2128 = 2502
    grep "p602_d8_D1_H3K27me3/peaks.txt|p602_d8_D2_H3K27me3/peaks.txt" p602_d8_H3K27me3_sicer_regions.txt | wc -l  # 1840
    grep "p602_d8_D1_H3K27me3/peaks.txt" p602_d8_H3K27me3_sicer_regions.txt | wc -l                                # 1940 - 1840 = 100
    grep "p602_d8_D2_H3K27me3/peaks.txt" p602_d8_H3K27me3_sicer_regions.txt | wc -l                                # 2013 - 1840 = 173
    # 1840 + 100 + 173 = 2113

    import matplotlib.pyplot as plt
    from matplotlib_venn import venn2
    venn2(subsets=(34, 340, 2128), set_labels=('Donor 1', 'Donor 2'))
    plt.title('Regions between p602 d8 H3K27me3 Donors')
    plt.xlabel('Number of Elements')
    plt.ylabel('')
    plt.savefig('Region_Consistency_Between_p600+p601_d9d12_H3K27me3_Donors.png', dpi=300, bbox_inches='tight')

    import matplotlib.pyplot as plt
    from matplotlib_venn import venn2        
    venn2(subsets=(100, 173, 1840), set_labels=('Donor 1', 'Donor 2'))
    plt.title('Regions between p602 d8 H3K27me3 Donors')
    plt.xlabel('Number of Elements')
    plt.ylabel('')
    plt.savefig('Region_Consistency_Between_p602_d8_H3K27me3_Donors.png', dpi=300, bbox_inches='tight')

    awk 'NR>1 {print $2 "\t" $3 "\t" $4 "\t" $1}' p600_601_d9d12_H3K27me3_sicer_regions.txt > p600+p601_d9d12_H3K27me3_sicer_regions.bed
    awk 'NR>1 {print $2 "\t" $3 "\t" $4 "\t" $1}' p602_d8_H3K27me3_sicer_regions.txt > p602_d8_H3K27me3_sicer_regions.bed
    ~/Tools/csv2xls-0.4/csv_to_xls.py p600_601_d9d12_H3K27me3_sicer_regions.txt -d$'\t' -o p600+p601_d9d12_H3K27me3_sicer_regions.xls
    ~/Tools/csv2xls-0.4/csv_to_xls.py p602_d8_H3K27me3_sicer_regions.txt -d$'\t' -o p602_d8_H3K27me3_sicer_regions.xls

5.6. identifying statistically significantly differentially bound sites based on evidence of binding affinity (measured by differences in read densities)

    #- Donor I p602 (d8) vs p600+601 (d12) H3K27me3
    #- Donor II p602 (d8) vs p600+601 (d9) H3K27me3        
    ./getDifferentialPeaksReplicates_broad.pl -t p602_d8_D1_H3K27me3 p602_d8_D2_H3K27me3 -b p600_601_d12_D1_H3K27me3 p600_601_d9_D2_H3K27me3 -genome hg38 -use peaks.txt -style histone > p602_d8_vs_p600_601_d9d12_H3K27me3_regions.txt

7.0. LOGs: result examples using different -d values

    # -- Max distance to merge: 35000 bp --
    getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K4me3_macs2_peaks_using_mergePeaks_d35000.txt

    Merging peaks... 
    Comparing p601_d8_D1_H3K4me3/peaks.txt (32453 total) and p601_d8_D1_H3K4me3/peaks.txt (32453 total)
    Comparing p601_d8_D1_H3K4me3/peaks.txt (32453 total) and p601_d8_D2_H3K4me3/peaks.txt (37269 total)
    Comparing p601_d8_D2_H3K4me3/peaks.txt (37269 total) and p601_d8_D1_H3K4me3/peaks.txt (32453 total)
    Comparing p601_d8_D2_H3K4me3/peaks.txt (37269 total) and p601_d8_D2_H3K4me3/peaks.txt (37269 total)

    p601_d8_D1_H3K4me3/peaks.txt    p601_d8_D2_H3K4me3/peaks.txt    Total   Name
        X   2830    p601_d8_D2_H3K4me3/peaks.txt
    X       714 p601_d8_D1_H3K4me3/peaks.txt
    X   X   12786   p601_d8_D1_H3K4me3/peaks.txt|p601_d8_D2_H3K4me3/peaks.txt

        Step2: Quantifying reads across target/background/input tag directories
        Peak File Statistics:
            Total Peaks: 16324

        Step3: Calling R for differential enrichment statistics (-DESeq2)
        Autodetecting input file format...
        Autodetected annotatePeaks.pl file
        Performing variance stabalization (rlog)...
        Using DESeq2 to calculate differential expression/enrichment...
        Output Stats input vs. target:
            Total Genes: 16324
            Total Up-regulated in target vs. input: 7193 (44.064%) [log2fold>1, FDR<0.05]
            Total Dn-regulated in target vs. input: 1319 (8.080%) [log2fold<-1, FDR<0.05]

    # -- Max distance to merge: 1000 bp --
    getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K4me3_macs2_peaks_d1000.txt
    getDifferentialPeaksReplicates.pl -t p604_d8_D1_H3K4me3 p604_d8_D2_H3K4me3 -i p604_d8_D1_input p604_d8_D2_input -genome hg38 -use peaks.txt > p604_d8_H3K4me3_macs2_peaks_d1000.txt

        Step1: Defining Putative Peak Set
    p601_d8_D1_H3K4me3/peaks.txt    p601_d8_D2_H3K4me3/peaks.txt    Total   Name
        X   10726   p601_d8_D2_H3K4me3/peaks.txt
    X       4883    p601_d8_D1_H3K4me3/peaks.txt
    X   X   23478   p601_d8_D1_H3K4me3/peaks.txt|p601_d8_D2_H3K4me3/peaks.txt
        Step3: Calling R for differential enrichment statistics (-DESeq2)
        Using DESeq2 to calculate differential expression/enrichment...
        Output Stats input vs. target:
            Total Genes: 39087
            Total Up-regulated in target vs. input: 28762 (73.585%) [log2fold>1, FDR<0.05]
            Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]

    p604_d8_D1_H3K4me3/peaks.txt    p604_d8_D2_H3K4me3/peaks.txt    Total   Name
        X   9806    p604_d8_D2_H3K4me3/peaks.txt
    X       4611    p604_d8_D1_H3K4me3/peaks.txt
    X   X   19586   p604_d8_D1_H3K4me3/peaks.txt|p604_d8_D2_H3K4me3/peaks.txt
        Using DESeq2 to calculate differential expression/enrichment...
        Output Stats input vs. target:
            Total Genes: 34003
            Total Up-regulated in target vs. input: 25135 (73.920%) [log2fold>1, FDR<0.05]
            Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]

    # -- Max distance to merge: 500 bp --
    vim /home/jhuang/anaconda3/envs/myperl/bin/getDifferentialPeaksReplicates.pl
    getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K4me3_macs2_peaks_d500.txt
    getDifferentialPeaksReplicates.pl -t p604_d8_D1_H3K4me3 p604_d8_D2_H3K4me3 -i p604_d8_D1_input p604_d8_D2_input -genome hg38 -use peaks.txt > p604_d8_H3K4me3_macs2_peaks_d500.txt

    p601_d8_D1_H3K4me3/peaks.txt    p601_d8_D2_H3K4me3/peaks.txt    Total   Name
        X   13537   p601_d8_D2_H3K4me3/peaks.txt
    X       8455    p601_d8_D1_H3K4me3/peaks.txt
    X   X   23009   p601_d8_D1_H3K4me3/peaks.txt|p601_d8_D2_H3K4me3/peaks.txt
        Using DESeq2 to calculate differential expression/enrichment...
        Output Stats input vs. target:
            Total Genes: 45001
            Total Up-regulated in target vs. input: 31206 (69.345%) [log2fold>1, FDR<0.05]
            Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]

    p604_d8_D1_H3K4me3/peaks.txt    p604_d8_D2_H3K4me3/peaks.txt    Total   Name
        X   12775   p604_d8_D2_H3K4me3/peaks.txt
    X       8797    p604_d8_D1_H3K4me3/peaks.txt
    X   X   18654   p604_d8_D1_H3K4me3/peaks.txt|p604_d8_D2_H3K4me3/peaks.txt
        Using DESeq2 to calculate differential expression/enrichment...
        Output Stats input vs. target:
            Total Genes: 40226
            Total Up-regulated in target vs. input: 27939 (69.455%) [log2fold>1, FDR<0.05]
            Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]
    cp p601_d8_H3K4me3_macs2_peaks_d500.txt p601_d8_H3K4me3_macs2_peaks.txt
    cp p604_d8_H3K4me3_macs2_peaks_d500.txt p604_d8_H3K4me3_macs2_peaks.txt

    awk 'NR>1 {print $2 "\t" $3 "\t" $4 "\t" $1}' p601_d8_H3K4me3_macs2_peaks.txt > p601_d8_H3K4me3_macs2_peaks.bed
    awk 'NR>1 {print $2 "\t" $3 "\t" $4 "\t" $1}' p604_d8_H3K4me3_macs2_peaks.txt > p604_d8_H3K4me3_macs2_peaks.bed
    ~/Tools/csv2xls-0.4/csv_to_xls.py p601_d8_H3K4me3_macs2_peaks.txt -d$'\t' -o p601_d8_H3K4me3_macs2_peaks.xls
    ~/Tools/csv2xls-0.4/csv_to_xls.py p604_d8_H3K4me3_macs2_peaks.txt -d$'\t' -o p604_d8_H3K4me3_macs2_peaks.xls
    #----> Send to Denise!

    # -- Max distance to merge: 100 bp --
    getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K4me3_macs2_peaks_d100.txt
    getDifferentialPeaksReplicates.pl -t p604_d8_D1_H3K4me3 p604_d8_D2_H3K4me3 -i p604_d8_D1_input p604_d8_D2_input -genome hg38 -use peaks.txt > p604_d8_H3K4me3_macs2_peaks_d100.txt
    p601_d8_D1_H3K4me3/peaks.txt    p601_d8_D2_H3K4me3/peaks.txt    Total   Name
        X   25160   p601_d8_D2_H3K4me3/peaks.txt
    X       20344   p601_d8_D1_H3K4me3/peaks.txt
    X   X   12109   p601_d8_D1_H3K4me3/peaks.txt|p601_d8_D2_H3K4me3/peaks.txt
        Using DESeq2 to calculate differential expression/enrichment...
        Output Stats input vs. target:
            Total Genes: 57613
            Total Up-regulated in target vs. input: 31179 (54.118%) [log2fold>1, FDR<0.05]
            Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]

    p604_d8_D1_H3K4me3/peaks.txt    p604_d8_D2_H3K4me3/peaks.txt    Total   Name
        X   23330   p604_d8_D2_H3K4me3/peaks.txt
    X       19855   p604_d8_D1_H3K4me3/peaks.txt
    X   X   8512    p604_d8_D1_H3K4me3/peaks.txt|p604_d8_D2_H3K4me3/peaks.txt
        Using DESeq2 to calculate differential expression/enrichment...
        Output Stats input vs. target:
            Total Genes: 51697
            Total Up-regulated in target vs. input: 28436 (55.005%) [log2fold>1, FDR<0.05]
            Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]


    cp ./homer/p601_d8_D1_input/p601_d8_D1_input.ucsc.bigWig bigWigs
    cp ./homer/p601_d8_D2_input/p601_d8_D2_input.ucsc.bigWig bigWigs
    cp ./homer/p604_d8_D1_input/p604_d8_D1_input.ucsc.bigWig bigWigs
    cp ./homer/p604_d8_D2_input/p604_d8_D2_input.ucsc.bigWig bigWigs

    cp ./homer/p601_d8_D1_H3K4me3/p601_d8_D1_H3K4me3.ucsc.bigWig bigWigs
    cp ./homer/p601_d8_D2_H3K4me3/p601_d8_D2_H3K4me3.ucsc.bigWig bigWigs
    cp ./homer/p604_d8_D1_H3K4me3/p604_d8_D1_H3K4me3.ucsc.bigWig bigWigs
    cp ./homer/p604_d8_D2_H3K4me3/p604_d8_D2_H3K4me3.ucsc.bigWig bigWigs

    cp ./homer/p601_d8_D1_H3K27me3/p601_d8_D1_H3K27me3.ucsc.bigWig bigWigs
    cp ./homer/p601_d8_D2_H3K27me3/p601_d8_D2_H3K27me3.ucsc.bigWig bigWigs
    cp ./homer/p604_d8_D1_H3K27me3/p604_d8_D1_H3K27me3.ucsc.bigWig bigWigs
    cp ./homer/p604_d8_D2_H3K27me3/p604_d8_D2_H3K27me3.ucsc.bigWig bigWigs
    #----> Send to Denise!

    cp ./homer/p601_d8_D1_H3K27ac/p601_d8_D1_H3K27ac.ucsc.bigWig bigWigs
    cp ./homer/p601_d8_D2_H3K27ac/p601_d8_D2_H3K27ac.ucsc.bigWig bigWigs
    cp ./homer/p604_d8_D1_H3K27ac/p604_d8_D1_H3K27ac.ucsc.bigWig bigWigs
    cp ./homer/p604_d8_D2_H3K27ac/p604_d8_D2_H3K27ac.ucsc.bigWig bigWigs
  1. getDifferentialPeaksReplicates_broad.pl (code)

    #!/usr/bin/env perl
    use warnings;
    use lib "/home/jhuang/Tools/homer/bin";
    my $homeDir = "/home/jhuang/Tools/homer/";
    
    my $foldThresh = 2;
    my $fdrThresh = 0.05;
    my $peakFoldInput = 2;
    my $peakFdrInput = 0.001;
    my $style = "";
    
    sub printCMD {
        print STDERR "\n\tUsage: getDifferentialPeaksReplicates.pl [options] -t <IP tagdir1> [IP tagdir2] ...\n";
        print STDERR "\t                                         -b <background tagdir1> [background tagdir2] ...\n";
        print STDERR "\t                                         -i <Input tagdir1> [Input tagdir2] ...\n";
        print STDERR "\t\tNote: if input is provided, peaks will be called.\n";
        print STDERR "\n\tOptions:\n";
        #print STDERR "\t\t-F <#> (fold enrichment over bg, default: $foldThresh)\n";
        #print STDERR "\t\t-fdr <#> (FDR over input, default: $fdrThresh)\n";
        print STDERR "\t\t-f <#> (fold enrichment over bg, default: $foldThresh)\n";
        print STDERR "\t\t-q <#> (FDR over bg, default: $fdrThresh)\n";
        print STDERR "\t\t-fdr <#>, -F <#>, -L <#> (parameters for findPeaks)\n";
        print STDERR "\t\t-genome <genome version> (genome version to use for annotation)\n";
        print STDERR "\t\t-DESeq2 | -DESeq | -edgeR (differential stats algorithm, default: DESeq2)\n";
        print STDERR "\t\t-balanced (normalize signal across peaks, default: normalize to read totals)\n";
        print STDERR "\t\t-fragLength <#> (standardize estimated fragment length across analysis)\n";
        print STDERR "\t\t-all (report all peaks, not just differentially regulated)\n";
        print STDERR "\n\tPeak finding directives:\n";
        print STDERR "\t\t-style <factor|histone|tss> (findPeaks style to use for finding peaks)\n";
        print STDERR "\t\t-use <peaks.txt|regions.txt|tss.txt> (use existing peaks in tag directories)\n";
        print STDERR "\t\t-p <peak file> (use specific peak file instead of tagDir/peaks.txt or finding new one)\n";
        print STDERR "\t\tOther options will be passed to findPeaks\n";
        print STDERR "\n";
        exit;
    
    }
    
    my @targets = ();
    my @background = ();
    my @inputs = ();
    my $findPeaksOpts = "";
    my $use = "";
    my $givenPeakFile = '';
    my $norm2total = "-norm2total";
    my $diffAlg = "-DESeq2";
    my $genome = 'none';
    my $annOptions = "";
    my $fragLength = '';
    my $allFlag = 0;
    my $ogCmd = "getDifferentialPeaksReplicates.pl";
    for (my $i=0;$i<@ARGV;$i++) {
        $ogCmd .= " " . $ARGV[$i];
    }
    
    for (my $i=0;$i<@ARGV;$i++) {
        if ($ARGV[$i] eq '-t') {
            $i++;
            while ($i < @ARGV) {
                if ($ARGV[$i] =~ /^-/) {
                    $i--;
                    last;
                }
                push(@targets, $ARGV[$i++]);
            }
        } elsif ($ARGV[$i] eq '-i') {
            $i++;
            while ($i < @ARGV) {
                if ($ARGV[$i] =~ /^-/) {
                    $i--;
                    last;
                }
                push(@inputs, $ARGV[$i++]);
            }
        } elsif ($ARGV[$i] eq '-b') {
            $i++;
            while ($i < @ARGV) {
                if ($ARGV[$i] =~ /^-/) {
                    $i--;
                    last;
                }
                push(@background, $ARGV[$i++]);
            }
        } elsif ($ARGV[$i] eq '-f') {
            $foldThresh = $ARGV[++$i];
        } elsif ($ARGV[$i] eq '-genome') {
            $genome = $ARGV[++$i];
        } elsif ($ARGV[$i] eq '-q') {
            $fdrThresh = $ARGV[++$i];
        } elsif ($ARGV[$i] eq '-use') {
            $use = $ARGV[++$i];
            if ($use eq 'tss.txt') {
                $annOptions .= " -fragLength 1 -strand +";
                $fragLength = " -fragLength 1" if ($fragLength eq '');
            }
        } elsif ($ARGV[$i] eq '-p') {
            $givenPeakFile = $ARGV[++$i];
        } elsif ($ARGV[$i] eq '-style') {
            $style = $ARGV[++$i];
            if ($style eq 'tss') {
                $annOptions .= " -fragLength 1 -strand +";
                $fragLength = " -fragLength 1" if ($fragLength eq '');
            }
        } elsif ($ARGV[$i] eq '-edgeR') {
            $diffAlg = $ARGV[$i];
        } elsif ($ARGV[$i] eq '-fragLength') {
            $fragLength = " -fragLength $ARGV[++$i]";
        } elsif ($ARGV[$i] eq '-DESeq2') {
            $diffAlg = $ARGV[$i];
        } elsif ($ARGV[$i] eq '-all') {
            $allFlag = 1;
        } elsif ($ARGV[$i] eq '-DESeq') {
            $diffAlg = $ARGV[$i];
        } elsif ($ARGV[$i] eq '-balanced') {
            $norm2total = "";
        } elsif ($ARGV[$i] eq '-h' || $ARGV[$i] eq '--help' || $ARGV[$i] eq '--') {
            printCMD();
        } else {
            $findPeaksOpts .= " " . $ARGV[$i];
            #print STDERR "!!! \"$ARGV[$i]\" not recognized\n";
            #printCMD();
        }
    }
    my $rand = rand();
    my %toDelete = ();
    if ($diffAlg eq '-edgeR' && $norm2total ne '') {
        print STDERR "!!! Error, -edgeR requires \"-balanced\" to work correctly!!!\n";
        exit;
    }
    $log2Thresh = log($foldThresh)/log(2);
    if (@targets < 1) {
        print STDERR "!!! Error, need at least one target directory!!!\n";
        printCMD();
    }
    if (scalar(@inputs) + scalar(@background) < 1) {
        print STDERR "\t!!! Error: program requires either input or background experiments to perform differential calculations!\n";
        exit;
    }
    my $targetDirs = '';
    my $targetStr = "";
    foreach(@targets) {
        $targetDirs .= " \"$_\"";
        $targetStr .= " target";
    }
    my $inputDirs = '';
    my $inputStr = "";
    foreach(@inputs) {
        $inputDirs .= " \"$_\"";
        $inputStr .= " input";
    }
    my $bgDirs = '';
    my $bgStr = "";
    foreach(@background) {
        $bgDirs .= " \"$_\"";
        $bgStr .= " bg";
    }
    
    if ($givenPeakFile eq '') {
        $peakFile = $rand . ".peaks";
        $toDelete{$peakFile}=1;
    } else {
        $peakFile = $givenPeakFile;
    }
    
    if ($findPeaksOpts ne '') {
        print STDERR "\tUsing the following extra parameters for findPeaks:\n\t\t$findPeaksOpts\n";
    }
    
    print STDERR "\tStep1: Defining Putative Peak Set\n";
    if ($use eq '' && $givenPeakFile eq '') {
        print STDERR "\t\tFinding peaks in merged meta-experiment from target tag directories\n";
        my $targetDir = $rand . ".targetTagDir";
        `makeTagDirectory \"$targetDir\" -d $targetDirs $fragLength`;
        $toDelete{$targetDir}=1;
        my $inputDir = $rand . ".inputTagDir";
        my $cmd = "findPeaks \"$targetDir\"";
        if ($style eq '') {
            $style = 'factor';
            print STDERR "\tUsing -style $style...\n";
        }
        $cmd .= " -style $style";
        $cmd .= $fragLength . " " . $findPeaksOpts;
        if (@inputs > 0) {
            `makeTagDirectory \"$inputDir\" -d $inputDirs $fragLength`;
            $toDelete{$inputDir}=1;
            $cmd .= " -i \"$inputDir\"";
        }
        #print STDERR "`$cmd > \"$peakFile\"`\n";
        `$cmd > \"$peakFile\"`;
    } elsif ($givenPeakFile eq '' && $use ne '') {
        my $files = "";
        my @allDirs = ();
        push(@allDirs, @targets, @inputs, @background);
        print STDERR "\t\tUsing existing peak files for features:\n";
        foreach(@allDirs) {
            my $p = $_ . "/" . $use;
            if (-e $p) {
                print STDERR "\t\t\t$p\n";
                $files .= " \"$p\"";
            }
        }
        #MODIFIED
        #`mergePeaks $files > \"$peakFile\"`;
        #print "$files\n";
        #print "\"$peakFile\"\n";
        `mergePeaks $files -d 35000 >  mergePeaks_res.txt`;
        `python3 update_header.py`;
        `cat $files > merged_peaks.txt`;
        ## Split the list of files into an array
        #my @files = split / /, $files;
        # Open the output file
        #open(my $out, '>', 'merged_peaks.txt') or die "Cannot open merged_peaks.txt: $!";
        #foreach my $file (@files) {
        #   print("xxxxx$file\n");
        #   open(my $in, '<', $file) or die "Cannot open $file: $!";
        #   while (<$in>) {
        #       s/\t+$//; # removes trailing tabs
        #       print $out "$_\n";
        #   }
        #   close $in;
        #}
        #close $out;
        system("awk '{print \$2 \"\\t\" \$3 \"\\t\" \$4 \"\\t\" \$1}' merged_peaks.txt | sort -k1,1 -k2,2n | bedtools merge -d 35000 > bedtools_res.txt");
        `python3 adjust_mergePeaks_res.py`;
        `mv filtered_mergePeaks_res.txt \"$peakFile\"`;
    }
    
    $rawFile= $rand . ".raw.txt";
    $diffFile= $rand . ".diff.txt";
    $upFile = $rand . ".Up_target_vs_bg.txt";
    $downFile = $rand . ".Down_target_vs_bg.txt";
    if ($bgDirs eq '') {
        $bgDirs = $inputDirs;
        $bgStr = $inputStr;
        @background = @inputs;
        $upFile = $rand . ".Up_target_vs_input.txt";
        $downFile = $rand . ".Down_target_vs_input.txt";
    }
    
    print STDERR "\n\tStep2: Quantifying reads across target/background/input tag directories\n\n";
    #print STDERR "`annotatePeaks.pl \"$peakFile\" none -d $bgDirs $targetDirs -raw > \"$rawFile\"`;\n";
    `annotatePeaks.pl \"$peakFile\" $genome -d $bgDirs $targetDirs -raw $annOptions $fragLength > \"$rawFile\"`;
    #print STDERR "`getDiffExpression.pl \"$rawFile\" $bgStr $targetStr $norm2total $diffAlg -fdr $fdrThresh -log2fold $log2Thresh -export $rand > $diffFile`;\n";
    #
    
    print STDERR "\n\tStep3: Calling R for differential enrichment statistics ($diffAlg)\n\n";
    `getDiffExpression.pl \"$rawFile\" $bgStr $targetStr $norm2total $diffAlg -fdr $fdrThresh -log2fold $log2Thresh -export $rand > $diffFile`;
    $toDelete{$rawFile}=1;
    $toDelete{$diffFile}=1;
    $toDelete{$upFile}=1;
    $toDelete{$downFile}=1;
    print "#cmd=$ogCmd|";
    my $ofile = $upFile;
    $ofile = $diffFile if ($allFlag);
    open IN, $ofile;
    while (<IN>) {
        print $_;
    }
    close IN;
    
    foreach(keys %toDelete) {
        next if ($_ eq '/');
        #print STDERR "\trm -r \"$_\"\n";
        `rm -r "$_"`;
    }
    
  2. getDifferentialPeaksReplicates_narrow.pl (code)

    #!/usr/bin/env perl
    use warnings;
    use lib "/home/jhuang/anaconda3/envs/myperl/share/homer/.//bin";
    my $homeDir = "/home/jhuang/anaconda3/envs/myperl/share/homer/./";
    
    my $foldThresh = 2;
    my $fdrThresh = 0.05;
    my $peakFoldInput = 2;
    my $peakFdrInput = 0.001;
    my $style = "";
    
    sub printCMD {
        print STDERR "\n\tUsage: getDifferentialPeaksReplicates.pl [options] -t <IP tagdir1> [IP tagdir2] ...\n";
        print STDERR "\t                                         -b <background tagdir1> [background tagdir2] ...\n";
        print STDERR "\t                                         -i <Input tagdir1> [Input tagdir2] ...\n";
        print STDERR "\t\tNote: if input is provided, peaks will be called.\n";
        print STDERR "\n\tOptions:\n";
        #print STDERR "\t\t-F <#> (fold enrichment over bg, default: $foldThresh)\n";
        #print STDERR "\t\t-fdr <#> (FDR over input, default: $fdrThresh)\n";
        print STDERR "\t\t-f <#> (fold enrichment over bg, default: $foldThresh)\n";
        print STDERR "\t\t-q <#> (FDR over bg, default: $fdrThresh)\n";
        print STDERR "\t\t-fdr <#>, -F <#>, -L <#> (parameters for findPeaks)\n";
        print STDERR "\t\t-genome <genome version> (genome version to use for annotation)\n";
        print STDERR "\t\t-DESeq2 | -DESeq | -edgeR (differential stats algorithm, default: DESeq2)\n";
        print STDERR "\t\t-balanced (normalize signal across peaks, default: normalize to read totals)\n";
        print STDERR "\t\t-fragLength <#> (standardize estimated fragment length across analysis)\n";
        print STDERR "\t\t-all (report all peaks, not just differentially regulated)\n";
        print STDERR "\n\tPeak finding directives:\n";
        print STDERR "\t\t-style <factor|histone|tss> (findPeaks style to use for finding peaks)\n";
        print STDERR "\t\t-use <peaks.txt|regions.txt|tss.txt> (use existing peaks in tag directories)\n";
        print STDERR "\t\t-p <peak file> (use specific peak file instead of tagDir/peaks.txt or finding new one)\n";
        print STDERR "\t\tOther options will be passed to findPeaks\n";
        print STDERR "\n";
        exit;
    
    }
    
    my @targets = ();
    my @background = ();
    my @inputs = ();
    my $findPeaksOpts = "";
    my $use = "";
    my $givenPeakFile = '';
    my $norm2total = "-norm2total";
    my $diffAlg = "-DESeq2";
    my $genome = 'none';
    my $annOptions = "";
    my $fragLength = '';
    my $allFlag = 0;
    my $ogCmd = "getDifferentialPeaksReplicates.pl";
    for (my $i=0;$i<@ARGV;$i++) {
        $ogCmd .= " " . $ARGV[$i];
    }
    
    for (my $i=0;$i<@ARGV;$i++) {
        if ($ARGV[$i] eq '-t') {
            $i++;
            while ($i < @ARGV) {
                if ($ARGV[$i] =~ /^-/) {
                    $i--;
                    last;
                }
                push(@targets, $ARGV[$i++]);
            }
        } elsif ($ARGV[$i] eq '-i') {
            $i++;
            while ($i < @ARGV) {
                if ($ARGV[$i] =~ /^-/) {
                    $i--;
                    last;
                }
                push(@inputs, $ARGV[$i++]);
            }
        } elsif ($ARGV[$i] eq '-b') {
            $i++;
            while ($i < @ARGV) {
                if ($ARGV[$i] =~ /^-/) {
                    $i--;
                    last;
                }
                push(@background, $ARGV[$i++]);
            }
        } elsif ($ARGV[$i] eq '-f') {
            $foldThresh = $ARGV[++$i];
        } elsif ($ARGV[$i] eq '-genome') {
            $genome = $ARGV[++$i];
        } elsif ($ARGV[$i] eq '-q') {
            $fdrThresh = $ARGV[++$i];
        } elsif ($ARGV[$i] eq '-use') {
            $use = $ARGV[++$i];
            if ($use eq 'tss.txt') {
                $annOptions .= " -fragLength 1 -strand +";
                $fragLength = " -fragLength 1" if ($fragLength eq '');
            }
        } elsif ($ARGV[$i] eq '-p') {
            $givenPeakFile = $ARGV[++$i];
        } elsif ($ARGV[$i] eq '-style') {
            $style = $ARGV[++$i];
            if ($style eq 'tss') {
                $annOptions .= " -fragLength 1 -strand +";
                $fragLength = " -fragLength 1" if ($fragLength eq '');
            }
        } elsif ($ARGV[$i] eq '-edgeR') {
            $diffAlg = $ARGV[$i];
        } elsif ($ARGV[$i] eq '-fragLength') {
            $fragLength = " -fragLength $ARGV[++$i]";
        } elsif ($ARGV[$i] eq '-DESeq2') {
            $diffAlg = $ARGV[$i];
        } elsif ($ARGV[$i] eq '-all') {
            $allFlag = 1;
        } elsif ($ARGV[$i] eq '-DESeq') {
            $diffAlg = $ARGV[$i];
        } elsif ($ARGV[$i] eq '-balanced') {
            $norm2total = "";
        } elsif ($ARGV[$i] eq '-h' || $ARGV[$i] eq '--help' || $ARGV[$i] eq '--') {
            printCMD();
        } else {
            $findPeaksOpts .= " " . $ARGV[$i];
            #print STDERR "!!! \"$ARGV[$i]\" not recognized\n";
            #printCMD();
        }
    }
    my $rand = rand();
    my %toDelete = ();
    if ($diffAlg eq '-edgeR' && $norm2total ne '') {
        print STDERR "!!! Error, -edgeR requires \"-balanced\" to work correctly!!!\n";
        exit;
    }
    $log2Thresh = log($foldThresh)/log(2);
    if (@targets < 1) {
        print STDERR "!!! Error, need at least one target directory!!!\n";
        printCMD();
    }
    if (scalar(@inputs) + scalar(@background) < 1) {
        print STDERR "\t!!! Error: program requires either input or background experiments to perform differential calculations!\n";
        exit;
    }
    my $targetDirs = '';
    my $targetStr = "";
    foreach(@targets) {
        $targetDirs .= " \"$_\"";
        $targetStr .= " target";
    }
    my $inputDirs = '';
    my $inputStr = "";
    foreach(@inputs) {
        $inputDirs .= " \"$_\"";
        $inputStr .= " input";
    }
    my $bgDirs = '';
    my $bgStr = "";
    foreach(@background) {
        $bgDirs .= " \"$_\"";
        $bgStr .= " bg";
    }
    
    if ($givenPeakFile eq '') {
        $peakFile = $rand . ".peaks";
        $toDelete{$peakFile}=1;
    } else {
        $peakFile = $givenPeakFile;
    }
    
    if ($findPeaksOpts ne '') {
        print STDERR "\tUsing the following extra parameters for findPeaks:\n\t\t$findPeaksOpts\n";
    }
    
    print STDERR "\tStep1: Defining Putative Peak Set\n";
    if ($use eq '' && $givenPeakFile eq '') {
        print STDERR "\t\tFinding peaks in merged meta-experiment from target tag directories\n";
        my $targetDir = $rand . ".targetTagDir";
        `makeTagDirectory \"$targetDir\" -d $targetDirs $fragLength`;
        $toDelete{$targetDir}=1;
        my $inputDir = $rand . ".inputTagDir";
        my $cmd = "findPeaks \"$targetDir\"";
        if ($style eq '') {
            $style = 'factor';
            print STDERR "\tUsing -style $style...\n";
        }
        $cmd .= " -style $style";
        $cmd .= $fragLength . " " . $findPeaksOpts;
        if (@inputs > 0) {
            `makeTagDirectory \"$inputDir\" -d $inputDirs $fragLength`;
            $toDelete{$inputDir}=1;
            $cmd .= " -i \"$inputDir\"";
        }
        #print STDERR "`$cmd > \"$peakFile\"`\n";
        `$cmd > \"$peakFile\"`;
    } elsif ($givenPeakFile eq '' && $use ne '') {
        my $files = "";
        my @allDirs = ();
        push(@allDirs, @targets, @inputs, @background);
        print STDERR "\t\tUsing existing peak files for features:\n";
        foreach(@allDirs) {
            my $p = $_ . "/" . $use;
            if (-e $p) {
                print STDERR "\t\t\t$p\n";
                $files .= " \"$p\"";
            }
        }
        `mergePeaks -d 2000 $files > \"$peakFile\"`;
        #`cat $files | sort -k1,1 -k2,2n | bedtools merge > \"$peakFile\"`;
    }
    
    $rawFile= $rand . ".raw.txt";
    $diffFile= $rand . ".diff.txt";
    $upFile = $rand . ".Up_target_vs_bg.txt";
    $downFile = $rand . ".Down_target_vs_bg.txt";
    if ($bgDirs eq '') {
        $bgDirs = $inputDirs;
        $bgStr = $inputStr;
        @background = @inputs;
        $upFile = $rand . ".Up_target_vs_input.txt";
        $downFile = $rand . ".Down_target_vs_input.txt";
    }
    
    print STDERR "\n\tStep2: Quantifying reads across target/background/input tag directories\n\n";
    #print STDERR "`annotatePeaks.pl \"$peakFile\" none -d $bgDirs $targetDirs -raw > \"$rawFile\"`;\n";
    `annotatePeaks.pl \"$peakFile\" $genome -d $bgDirs $targetDirs -raw $annOptions $fragLength > \"$rawFile\"`;
    #print STDERR "`getDiffExpression.pl \"$rawFile\" $bgStr $targetStr $norm2total $diffAlg -fdr $fdrThresh -log2fold $log2Thresh -export $rand > $diffFile`;\n";
    #
    
    print STDERR "\n\tStep3: Calling R for differential enrichment statistics ($diffAlg)\n\n";
    `getDiffExpression.pl \"$rawFile\" $bgStr $targetStr $norm2total $diffAlg -fdr $fdrThresh -log2fold $log2Thresh -export $rand > $diffFile`;
    $toDelete{$rawFile}=1;
    $toDelete{$diffFile}=1;
    $toDelete{$upFile}=1;
    $toDelete{$downFile}=1;
    print "#cmd=$ogCmd|";
    my $ofile = $upFile;
    $ofile = $diffFile if ($allFlag);
    open IN, $ofile;
    while (<IN>) {
        print $_;
    }
    close IN;
    
    #foreach(keys %toDelete) {
    #   next if ($_ eq '/');
    #   #print STDERR "\trm -r \"$_\"\n";
    #   `rm -r "$_"`;
    #}
    
  3. update_header.py (code)

    import os
    
    # File path
    file_path = 'mergePeaks_res.txt'
    
    # Read in the file
    with open(file_path, 'r') as file:
        lines = file.readlines()
    
    # Split the first line into components
    components = lines[0].split("\t")
    
    # Change the first component to "name"
    components[0] = "name"
    
    # Join the components back into a single string
    lines[0] = "\t".join(components)
    
    # Write the file back out
    with open(file_path, 'w') as file:
        file.writelines(lines)
    
  4. adjust_mergePeaks_res.py (code)

    import pandas as pd
    
    # Read the mergePeaks result file and bedtools result file
    mergePeaks_df = pd.read_csv('mergePeaks_res.txt', sep='\t', header=0)
    bedtools_df = pd.read_csv('bedtools_res.txt', sep='\t', header=None, names=['chr', 'start', 'end'])
    
    # Function to check if a peak is within bedtools ranges
    # def is_in_bedtools_range(row):
    filtered_mergePeaks_df = []
    for _, row in mergePeaks_df.iterrows():
        for _, bed_row in bedtools_df.iterrows():
            if row['chr'] == bed_row['chr'] and row['start'] >= bed_row['start'] and row['end'] <= bed_row['end']:
                row['start'] = bed_row['start']
                row['end'] = bed_row['end']
                filtered_mergePeaks_df.append(row)
    
    # Convert the filtered results to a DataFrame
    filtered_mergePeaks_df = pd.DataFrame(filtered_mergePeaks_df)
    
    ## Write the filtered results to a file
    #filtered_mergePeaks_df.to_csv('filtered_mergePeaks_res.txt', sep='\t', index=False)
    
    ## Filter the mergePeaks rows
    #filtered_mergePeaks_df = mergePeaks_df[mergePeaks_df.apply(is_in_bedtools_range, axis=1)]
    
    ## Sort and drop duplicates
    sorted_df = filtered_mergePeaks_df.sort_values(by=['chr', 'start', 'end'])
    deduplicated_df = sorted_df.drop_duplicates(subset=['chr', 'start', 'end'])
    
    ## Save to file
    deduplicated_df.to_csv('filtered_mergePeaks_res.txt', sep='\t', index=False)
    

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum