gene_x 0 like s 603 view s
Tags: pipeline, ChIP-seq
nextflow processing data
(chipseq) jhuang@hamm:/mnt/h1/jhuang/DATA/Data_Denise_LT_DNA_Binding/Histone-ChIP_hg38/H3K27ac_H3K4me1_public$ nextflow run NGI-ChIPseq/main.nf --reads '/mnt/h1/jhuang/DATA/Data_Denise_LT_DNA_Binding/Histone-ChIP_hg38/H3K27ac_H3K4me1_public/Raw_Data/*.fastq.gz' --genome hg38 --macsconfig macs.config --saveReference --saveAlignedIntermediates --singleEnd --blacklist_filtering -profile standard --project NHDF_enhancer_analysis_hg38 -resume
./picard/V_8_3_1_p600_601_d12_D1_input.dedup.sorted.bam
./picard/V_8_3_2_p600_601_d9_D2_input.dedup.sorted.bam
./picard/V_8_4_2_p602_d8_D1_input.dedup.sorted.bam
./picard/V_8_4_1_p602_d8_D2_input.dedup.sorted.bam
./picard/V_8_3_1_p600_601_d12_D1_H3K4me3.dedup.sorted.bam
./picard/V_8_3_2_p600_601_d9_D2_H3K4me3.dedup.sorted.bam
./picard/V_8_4_2_p602_d8_D1_H3K4me3.dedup.sorted.bam
./picard/V_8_4_1_p602_d8_D2_H3K4me3.dedup.sorted.bam
./picard/V_8_3_1_p600_601_d12_D1_H3K27me3.dedup.sorted.bam
./picard/V_8_3_2_p600_601_d9_D2_H3K27me3.dedup.sorted.bam
./picard/V_8_4_2_p602_d8_D1_H3K27me3.dedup.sorted.bam
./picard/V_8_4_1_p602_d8_D2_H3K27me3.dedup.sorted.bam
make homer directories and findPeaks with HOMER under (conda homer, but note that homer is installed under Tools manually!)
#HOMER will be installed in the same directory that you place the configureHomer.pl program. configureHomer.pl will attempt to check for required utilities and alert you to missing programs.
1. mkdir ~/Tools/homer; cd ~/Tools/homer
2. wget http://homer.ucsd.edu/homer/configureHomer.pl
3. Run the configureHomer.pl script to install homer: perl configureHomer.pl -install
4. Add the homer/bin directory to your executable path. For example, edit your ~/.bashrc file to include the line:
PATH=$PATH:/home/jhuang/Tools/homer/bin/
5. Reset your terminal so that the changes to the PATH variable take effect: source ~/.bashrc
#jhuang@hamm:~/Tools/homer/bin$ wget http://homer.ucsd.edu/homer/configureHomer.pl
#configureHomer.pl -install
#chmod +x /home/jhuang/miniconda3/envs/homer/bin/configureHomer.pl
#ls ~/miniconda3/envs/homer/bin
#echo $PATH
#If /home/jhuang/miniconda3/envs/homer/bin is not in the output, you'll need to add it to your $PATH.
chmod +x ~/Tools/homer/configureHomer.pl
./configureHomer.pl -install hg38
./configureHomer.pl -list
#conda update -n base -c defaults conda
conda create -n homer
conda activate homer
#(NOT_USE!) conda install -c bioconda homer
mamba install r-essentials bioconductor-deseq2
mamba install samtools
#bioconductor-edger wget
#http://homer.ucsd.edu/homer/introduction/update.html
#http://homer.ucsd.edu/homer/introduction/install.html
conda install -c bioconda ucsc-bedgraphtobigwig # install bedGraphToBigWig on @homer
conda install -c bioconda macs2
conda install -c anaconda pandas
#under (myperl) on computer @hamburg
mkdir homer bigWigs macs2 sicer
cd homer
#Why do I need give "-genome hg38" in makeTagDirectory? If you don't provide a genome with the -genome option, HOMER will only count the number of tags in each region without any genomic context or sequence information. #So, it is essential to include this information when creating a tag directory if you plan to perform any genome-based analysis.
makeTagDirectory p600_601_d12_D1_input ../results/picard/V_8_3_1_p600_601_d12_D1_input.dedup.sorted.bam -genome hg38
makeTagDirectory p600_601_d9_D2_input ../results/picard/V_8_3_2_p600_601_d9_D2_input.dedup.sorted.bam -genome hg38
makeTagDirectory p602_d8_D1_input ../results/picard/V_8_4_2_p602_d8_D1_input.dedup.sorted.bam -genome hg38
makeTagDirectory p602_d8_D2_input ../results/picard/V_8_4_1_p602_d8_D2_input.dedup.sorted.bam -genome hg38
makeTagDirectory p600_601_d12_D1_H3K4me3 ../results/picard/V_8_3_1_p600_601_d12_D1_H3K4me3.dedup.sorted.bam -genome hg38
makeTagDirectory p600_601_d9_D2_H3K4me3 ../results/picard/V_8_3_2_p600_601_d9_D2_H3K4me3.dedup.sorted.bam -genome hg38
makeTagDirectory p602_d8_D1_H3K4me3 ../results/picard/V_8_4_2_p602_d8_D1_H3K4me3.dedup.sorted.bam -genome hg38
makeTagDirectory p602_d8_D2_H3K4me3 ../results/picard/V_8_4_1_p602_d8_D2_H3K4me3.dedup.sorted.bam -genome hg38
makeTagDirectory p600_601_d12_D1_H3K27me3 ../results/picard/V_8_3_1_p600_601_d12_D1_H3K27me3.dedup.sorted.bam -genome hg38
makeTagDirectory p600_601_d9_D2_H3K27me3 ../results/picard/V_8_3_2_p600_601_d9_D2_H3K27me3.dedup.sorted.bam -genome hg38
makeTagDirectory p602_d8_D1_H3K27me3 ../results/picard/V_8_4_2_p602_d8_D1_H3K27me3.dedup.sorted.bam -genome hg38
makeTagDirectory p602_d8_D2_H3K27me3 ../results/picard/V_8_4_1_p602_d8_D2_H3K27me3.dedup.sorted.bam -genome hg38
for sample in p600_601_d12_D1_input p600_601_d9_D2_input p602_d8_D1_input p602_d8_D2_input p600_601_d12_D1_H3K4me3 p600_601_d9_D2_H3K4me3 p602_d8_D1_H3K4me3 p602_d8_D2_H3K4me3 p600_601_d12_D1_H3K27me3 p600_601_d9_D2_H3K27me3 p602_d8_D1_H3K27me3 p602_d8_D2_H3K27me3; do
makeUCSCfile ${sample} -bigWig /home/jhuang/REFs/hg38.chromSizes -o auto -style chipseq #-pseudo 1 -norm 1e7 -normLength 100 -fsize 1
mv ${sample}/${sample}.ucsc.bigWig ../bigWigs/
done
#Note that normLength should possibly read the fragmentation length from phantompeakqualtools. Easier to set all libraries to the default value: 100.
#To normalize to a fragment length of 150, you would use -normLength 150.
#To turn off normalization, you would use -normLength 0.
# -- not necessary any more: using MACS2 and SICER instead of using findPeaks ------------------------> go STEPs 4.1 and 5.1.
# #factor (transcription factor ChIP-Seq, uses -center, output: peaks.txt, default)
# #histone (histone modification ChIP-Seq, region based, uses -region -size 500 -L 0, regions.txt)
# for sample in p601_d8_D1 p601_d8_D2 p604_d8_D1 p604_d8_D2; do
# #Finding peaks of size 1000, no closer than 2000
# findPeaks ${sample}_H3K4me3 -style factor -size 1000 -o auto -i ${sample}_input
# #-minDist <#> (minimum distance between peaks, default: peak size x2)
# #findPeaks ${sample}_H3K27me3 -style histone -region -size 3000 -minDist 5000 -o auto -i ${sample}_input
# #findPeaks ${sample}_H3K27ac -style factor -size 200 -minDist 200 -o auto -i ${sample}_input
# #findPeaks ${sample}_H3K4me1 -style histone -region -size 1000 -minDist 2500 -o auto -i ${sample}_input
# done
./p601_d8_D1_H3K4me3/peaks.txt
./p601_d8_D2_H3K4me3/peaks.txt
./p604_d8_D1_H3K4me3/peaks.txt
./p604_d8_D2_H3K4me3/peaks.txt
./p601_d8_D1_H3K27me3/regions.txt
./p601_d8_D2_H3K27me3/regions.txt
./p604_d8_D1_H3K27me3/regions.txt
./p604_d8_D2_H3K27me3/regions.txt
for dir in p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 p604_d8_D1_H3K4me3 p604_d8_D2_H3K4me3; do
awk -v OFS='\t' '{print $2, $3, $4, $1, $6}' ./${dir}/peaks.txt > ${dir}_peaks.bed
grep -v "#" ${dir}_peaks.bed | sort -k1,1 -k2,2n > ${dir}_sorted_peaks.bed
done
for dir in p601_d8_D1_H3K27me3 p601_d8_D2_H3K27me3 p604_d8_D1_H3K27me3 p604_d8_D2_H3K27me3; do
awk -v OFS='\t' '{print $2, $3, $4, $1, $6}' ./${dir}/regions.txt > ${dir}_regions.bed
grep -v "#" ${dir}_regions.bed | sort -k1,1 -k2,2n > ${dir}_sorted_regions.bed
done
#DEBUG: why the bam files so small?
makeTagDirectory NHDF-Ad_Control_r1 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_Control_r1.dedup.sorted.bam -genome hg38
makeTagDirectory NHDF-Ad_Control_r2 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_Control_r2.dedup.sorted.bam -genome hg38
makeTagDirectory NHDF-Ad_H3K27ac_r1 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_H3K27ac_r1.dedup.sorted.bam -genome hg38
makeTagDirectory NHDF-Ad_H3K27ac_r2 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_H3K27ac_r2.dedup.sorted.bam -genome hg38
makeTagDirectory NHDF-Ad_H3K4me1_r1 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_H3K4me1_r1.dedup.sorted.bam -genome hg38
makeTagDirectory NHDF-Ad_H3K4me1_r2 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_H3K4me1_r2.dedup.sorted.bam -genome hg38
NHDF-Ad_Control_r1 NHDF-Ad_Control_r2 NHDF-Ad_H3K27ac_r1 NHDF-Ad_H3K27ac_r2 NHDF-Ad_H3K4me1_r1 NHDF-Ad_H3K4me1_r2
> (myperl) environments for HOMER, ~/Tools/diffreps/bin/diffReps.pl, MACS2, ~/Tools/SICER1.1/SICER/SICER.sh
2.5. diffbind (NOT_USED) https://hbctraining.github.io/Intro-to-ChIPseq/lessons/08_diffbind_differential_peaks.html #-->identifying statistically significantly differentially bound sites https://genomebiology.biomedcentral.com/articles/10.1186/s13059-022-02686-y
#samplesheet.csv
#SampleID, Condition, Peaks, Counts
#Sample1, Control, sample1_peaks.bed, sample1.bam
#Sample2, Treatment, sample2_peaks.bed, sample2.bam
library(DiffBind)
samples <- dba(sampleSheet="samplesheet.csv")
3.0. combine the diffReps.pl and HOMER annotatePeaks.pl, so we can use for hg38 and mm10 and so on (IMPORTANT, DON'T NEED --gname hg38, run diffReps.pl --> bed_file --> annotatePeaks.pl it with HOMER)
#Dynamic regions were defined as MACS (H3K4me3, H3K27ac) or SICER (H3K4me1, H3K27me3) peaks overlapping significantly (≥ 2-fold change, adjusted P-value ≤ 0.05) up- or down-regulated differentially enriched regions from diffReps in the three pairwise comparisons WAC vs mock, WA314 vs mock and WAC vs WA314.
#STEP1
#--> not given "--gname hg38"
## Step4: Annotate differential sites.
#unless($noanno or $gname eq ''){
# `region_analysis.pl -i $report -r -d refseq -g $gname`;
#}
## Step5: Look for hotspots.
#unless($nohs){
# my $hotspot = $report . '.hotspot';
# `findHotspots.pl -d $report -o $hotspot`;
#}
~/Tools/diffreps/bin/diffReps.pl -tr ../results/picard/V_8_1_6_p601_d8_D1_H3K4me3.dedup.sorted.bed ../results/picard/V_8_1_5_p601_d8_D2_H3K4me3.dedup.sorted.bed -co ../results/picard/V_8_1_6_p601_d8_D1_input.dedup.sorted.bed ../results/picard/V_8_1_5_p601_d8_D2_input.dedup.sorted.bed --report output_results --chrlen /home/jhuang/REFs/hg38.chromSizes --nsd sharp --noanno
~/Tools/diffreps/bin/diffReps.pl -tr ../results/picard/V_8_1_6_p601_d8_D1_H3K4me3.dedup.sorted.bed ../results/picard/V_8_1_5_p601_d8_D2_H3K4me3.dedup.sorted.bed -co ../results/picard/V_8_1_6_p601_d8_D1_input.dedup.sorted.bed ../results/picard/V_8_1_5_p601_d8_D2_input.dedup.sorted.bed --report p602_vs_p600_601_H3K4me3_diff_out --chrlen /home/jhuang/REFs/hg38.chromSizes --nsd sharp --noanno
~/Tools/diffreps/bin/diffReps.pl --treatment ./results/picard/V_8_4_2_p602_d8_D1_H3K27me3.dedup.sorted.bed ./results/picard/V_8_4_1_p602_d8_D2_H3K27me3.dedup.sorted.bed --btr ./results/picard/V_8_4_2_p602_d8_D1_input.dedup.sorted.bed ./results/picard/V_8_4_1_p602_d8_D2_input.dedup.sorted.bed --control ./results/picard/V_8_3_1_p600_601_d12_D1_H3K27me3.dedup.sorted.bed ./results/picard/V_8_3_2_p600_601_d9_D2_H3K27me3.dedup.sorted.bed --bco ./results/picard/V_8_3_1_p600_601_d12_D1_input.dedup.sorted.bed ./results/picard/V_8_3_2_p600_601_d9_D2_input.dedup.sorted.bed --report p602_vs_p600_601_H3K27me3_diff_out --chrlen /home/jhuang/REFs/hg38.chromSizes --nsd broad --mode block --window 10000 --gap 30000 --noanno
#I have utilized 'diffReps' for this analysis because the methods I previously mentioned in my last email were not effective in identifying sites with statistically significant differential binding. 'diffReps' appears to be a more suitable tool for our needs. Below is the command I used. It's important to note that 'diffReps' typically requires replicates as input and outputs a set of peaks/regions.
#STEP2
#replace Chr to '#Chr'
grep -v "#" output_results | sort -k1,1 -k2,2n > output_results_
awk 'BEGIN {OFS="\t"} {print $1, $2, $3, "diffreps_peak_"NR, $12}' output_results_ > H3K4me3.bed
#grep -v "#" H3K4me3.bed | sort -k1,1 -k2,2n > H3K4me3_sorted_peaks.bed
grep -v "#" p602_vs_p600_601_H3K4me3_diff_out | sort -k1,1 -k2,2n > p602_vs_p600_601_H3K4me3_diff_out_
awk 'BEGIN {OFS="\t"} {print $1, $2, $3, "diffreps_peak_"NR, $12}' p602_vs_p600_601_H3K4me3_diff_out_ > p602_vs_p600_601_H3K4me3.bed
grep -v "#" p602_vs_p600_601_H3K27me3_diff_out | sort -k1,1 -k2,2n > p602_vs_p600_601_H3K27me3_diff_out_
awk 'BEGIN {OFS="\t"} {print $1, $2, $3, "diffreps_peak_"NR, $12}' p602_vs_p600_601_H3K27me3_diff_out_ > p602_vs_p600_601_H3K27me3.bed
#STEP3 (under myperl) peak calling macs2 for narrow peaks, CISER for broad peaks!
#process the output of diffReps.pl to BED file.
annotatePeaks.pl H3K4me3.bed hg38 > H3K4me3_annotated_peaks.txt
annotatePeaks.pl p602_vs_p600_601_H3K4me3.bed hg38 > p602_vs_p600_601_H3K4me3_annotated_peaks.txt
annotatePeaks.pl p602_vs_p600_601_H3K27me3.bed hg38 > p602_vs_p600_601_H3K27me3_annotated_peaks.txt
(head -n 1 p602_vs_p600_601_H3K4me3_annotated_peaks.txt && tail -n +2 p602_vs_p600_601_H3K4me3_annotated_peaks.txt | sort -t "_" -k 3 -n) > p602_vs_p600_601_H3K4me3_annotated_peaks_.txt
(head -n 1 p602_vs_p600_601_H3K27me3_annotated_peaks.txt && tail -n +2 p602_vs_p600_601_H3K27me3_annotated_peaks.txt | sort -t "_" -k 3 -n) > p602_vs_p600_601_H3K27me3_annotated_peaks_.txt
~/Tools/csv2xls-0.4/csv_to_xls.py p602_vs_p600_601_H3K4me3_annotated_peaks_.txt -d$'\t' -o p602_vs_p600_601_H3K4me3_annotated_peaks.xls
~/Tools/csv2xls-0.4/csv_to_xls.py p602_vs_p600_601_H3K27me3_annotated_peaks_.txt -d$'\t' -o p602_vs_p600_601_H3K27me3_annotated_peaks.xls
4.0. combine macs2 to getDifferentialPeaksReplicates.pl
replace the initial peak identification by using your MACS2 output.
#http://homer.ucsd.edu/homer/ngs/diffExpression.html
#getDifferentialPeaksReplicates.pl = findPeaks + annotatePeaks.pl + getDiffExpression.pl
#annotatePeaks.pl tss hg38 -raw -d H3K4me3-Mock-rep1/ H3K4me3-Mock-rep2/ H3K4me3-WNT-rep1/ H3K4me3-WNT-rep3/ > countTable.peaks.txt
Here's an outline of how we might be able to replace the initial peak identification by using your MACS2 output.
#TODO: using MACS call peaks of the data H3K27ac.
4.1. MACS2 peak calling
# -- macs2 --> bed --> annotatePeaks.pl
conda activate homer
cd macs2
macs2 callpeak -t ../results/picard/V_8_3_1_p600_601_d12_D1_H3K4me3.dedup.sorted.bam -c ../results/picard/V_8_3_1_p600_601_d12_D1_input.dedup.sorted.bam -f BAM -g hs -n p600_601_d12_D1 -q 0.05
macs2 callpeak -t ../results/picard/V_8_3_2_p600_601_d9_D2_H3K4me3.dedup.sorted.bam -c ../results/picard/V_8_3_2_p600_601_d9_D2_input.dedup.sorted.bam -f BAM -g hs -n p600_601_d9_D2 -q 0.05
macs2 callpeak -t ../results/picard/V_8_4_2_p602_d8_D1_H3K4me3.dedup.sorted.bam -c ../results/picard/V_8_4_2_p602_d8_D1_input.dedup.sorted.bam -f BAM -g hs -n p602_d8_D1 -q 0.05
macs2 callpeak -t ../results/picard/V_8_4_1_p602_d8_D2_H3K4me3.dedup.sorted.bam -c ../results/picard/V_8_4_1_p602_d8_D2_input.dedup.sorted.bam -f BAM -g hs -n p602_d8_D2 -q 0.05
awk 'OFS="\t" {print $1, $2, $3, $4, $5}' p600_601_d12_D1_peaks.narrowPeak > p600_601_d12_D1_peaks.bed
awk 'OFS="\t" {print $1, $2, $3, $4, $5}' p600_601_d9_D2_peaks.narrowPeak > p600_601_d9_D2_peaks.bed
awk 'OFS="\t" {print $1, $2, $3, $4, $5}' p602_d8_D1_peaks.narrowPeak > p602_d8_D1_peaks.bed
awk 'OFS="\t" {print $1, $2, $3, $4, $5}' p602_d8_D2_peaks.narrowPeak > p602_d8_D2_peaks.bed
#annotatePeaks.pl p601_d8_D1_peaks.bed hg38 > p601_d8_D1_annotated_peaks.txt
#annotatePeaks.pl p601_d8_D2_peaks.bed hg38 > p601_d8_D2_annotated_peaks.txt
#annotatePeaks.pl p604_d8_D1_peaks.bed hg38 > p604_d8_D1_annotated_peaks.txt
#annotatePeaks.pl p604_d8_D2_peaks.bed hg38 > p604_d8_D2_annotated_peaks.txt
4.2. Convert your MACS2 peaks to HOMER-compatible format. You can do this manually or with a script. For example:
It's possible to use more information from the MACS2 output file to create a more informative peaks.txt file for HOMER. However, it's important to note that some information that HOMER needs for its differential peak analysis is not available in the MACS2 output (such as Normalized Tag Count, Control Tags, and others). But we can certainly map more of the available MACS2 columns to the corresponding HOMER columns.
#The following awk command can be used to convert more MACS2 information into the HOMER format:
cd macs2
#awk 'BEGIN{OFS="\t"}{print $1,$2,$3,"Peak_"NR,$5,$6,$7,$8,$9,$10}' macs2_peaks.bed > macs2_peaks.txt
awk 'BEGIN{OFS="\t"} NR > 1 {print $10, $1, $2, $3, "+", "0", "0", $6, $6, "0", $8, $5, $9, "0", "0"}' p600_601_d12_D1_peaks.xls > p600_601_d12_D1_macs2_peaks.txt
awk 'BEGIN{OFS="\t"} NR > 1 {print $10, $1, $2, $3, "+", "0", "0", $6, $6, "0", $8, $5, $9, "0", "0"}' p600_601_d9_D2_peaks.xls > p600_601_d9_D2_macs2_peaks.txt
awk 'BEGIN{OFS="\t"} NR > 1 {print $10, $1, $2, $3, "+", "0", "0", $6, $6, "0", $8, $5, $9, "0", "0"}' p602_d8_D1_peaks.xls > p602_d8_D1_macs2_peaks.txt
awk 'BEGIN{OFS="\t"} NR > 1 {print $10, $1, $2, $3, "+", "0", "0", $6, $6, "0", $8, $5, $9, "0", "0"}' p602_d8_D2_peaks.xls > p602_d8_D2_macs2_peaks.txt
This command will:
* Skip the header line (NR > 1)
* Map the MACS2 peak name ($10) to the HOMER PeakID
* Map the MACS2 chromosome, start, and end ($1, $2, $3) to the HOMER chr, start, end
* Use a placeholder "+" for the HOMER strand
* Use a placeholder "0" for the HOMER Normalized Tag Count and Focus Ratio
* Map the MACS2 pileup ($6) to the HOMER findPeaks Score and Total Tags
* Use a placeholder "0" for the HOMER Control Tags
* Map the MACS2 fold_enrichment ($8) to the HOMER Fold Change vs Control
* Map the MACS2 abs_summit ($5) to the HOMER p-value vs Control
* Map the MACS2 -log10(qvalue) ($9) to the HOMER Fold Change vs Local
* Use a placeholder "0" for the HOMER p-value vs Local and Clonal Fold Change
This script is limited by the differences in the information provided by MACS2 and HOMER. While it makes use of as much information as possible from the MACS2 output, some columns in the HOMER format still have to be filled with placeholder values.
4.3. Associate the converted peak files with their respective tag directories. In HOMER, peak files can be associated with a tag directory by placing them in the tag directory with the filename "peaks.txt".
#mv homer/p600_601_d12_D1_H3K4me3/peaks.txt homer/p600_601_d12_D1_H3K4me3/peaks_raw.txt
#mv homer/p600_601_d9_D2_H3K4me3/peaks.txt homer/p600_601_d9_D2_H3K4me3/peaks_raw.txt
#mv homer/p602_d8_D1_H3K4me3/peaks.txt homer/p602_d8_D1_H3K4me3/peaks_raw.txt
#mv homer/p602_d8_D2_H3K4me3/peaks.txt homer/p602_d8_D2_H3K4me3/peaks_raw.txt
cp macs2/p600_601_d12_D1_macs2_peaks.txt homer/p600_601_d12_D1_H3K4me3/peaks.txt
cp macs2/p600_601_d9_D2_macs2_peaks.txt homer/p600_601_d9_D2_H3K4me3/peaks.txt
cp macs2/p602_d8_D1_macs2_peaks.txt homer/p602_d8_D1_H3K4me3/peaks.txt
cp macs2/p602_d8_D2_macs2_peaks.txt homer/p602_d8_D2_H3K4me3/peaks.txt
#33896 p600_601_d12_D1_peaks.bed
#47977 p600_601_d9_D2_peaks.bed
#59049 p602_d8_D1_peaks.bed
#33730 p602_d8_D2_peaks.bed
#CHECK and DELETE headers including "name ..." in the new peaks.txt files!
#Repeat this for each of your tag directories.
4.4. The program getDifferentialPeaksReplicates will essentially perform 3 steps, in the step 2 was modified.
First, it will pool the target tag directories and input directories separately into pooled experiments and perform an initial peak identification (using findPeaks). Pooling the experiments is generally more sensitive than trying to merge the individual peak files coming from each experiment (although this can be done using the "-use
#-- Successful modification of the script getDifferentialPeaksReplicates.pl (Explain how to archieve getDifferentialPeaksReplicates.pl, only for DEBUG, for calculation directly run getDifferentialPeaksReplicates.pl!) --
#The -d parameter in the mergePeaks function in HOMER is used to specify the maximum distance between peak centers
#change Max distance to merge to 30000 bp in getDifferentialPeaksReplicates.pl
#mergePeaks -d 500 [100,1000] temp_sorted | sort
#conda list homer #4.11
mergePeaks -d 2000 p600_601_d12_D1_H3K4me3/peaks.txt p600_601_d9_D2_H3K4me3/peaks.txt > mergePeaks_res1.txt # get 37569 records
mergePeaks -d 2000 p602_d8_D1_H3K4me3/peaks.txt p602_d8_D2_H3K4me3/peaks.txt > mergePeaks_res2.txt # get 44076 records
#(homer) jhuang@hamburg:~/DATA/Data_Denise_LT_DNA_Bindung/results_chipseq_histone_hg38/H3K4me3_H3K27ac__H3K27me3_H3K9me3/homer$
#getDifferentialPeaksReplicates.pl -use <tag_directory1>/peaks.txt,<tag_directory2>/peaks.txt ...
#NOTE that the H3K4me3 has zu viele peaks, we can use the slightly modified getDifferentialPeaksReplicates_narrow.pl, "-d 2000"
./getDifferentialPeaksReplicates_narrow.pl -t p600_601_d12_D1_H3K4me3 p600_601_d9_D2_H3K4me3 -i p600_601_d12_D1_input p600_601_d9_D2_input -genome hg38 -use peaks.txt -style histone > p600_601_d9d12_H3K4me3_macs2_peaks.txt
#p600_601_d12_D1_H3K4me3/peaks.txt p600_601_d9_D2_H3K4me3/peaks.txt Total Name
# X 11530 p600_601_d9_D2_H3K4me3/peaks.txt
#X 2158 p600_601_d12_D1_H3K4me3/peaks.txt
#X X 23881 p600_601_d12_D1_H3K4me3/peaks.txt|p600_601_d9_D2_H3K4me3/peaks.txt
# Using DESeq2 to calculate differential expression/enrichment...
# Output Stats input vs. target:
# Total Genes: 37569
# Total Up-regulated in target vs. input: 26340 (70.111%) [log2fold>1, FDR<0.05]
# Total Dn-regulated in target vs. input: 1 (0.003%) [log2fold<-1, FDR<0.05]
./getDifferentialPeaksReplicates_narrow.pl -t p602_d8_D1_H3K4me3 p602_d8_D2_H3K4me3 -i p602_d8_D1_input p602_d8_D2_input -genome hg38 -use peaks.txt -style histone > p602_d8_H3K4me3_macs2_peaks.txt
#p602_d8_D1_H3K4me3/peaks.txt p602_d8_D2_H3K4me3/peaks.txt Total Name
# X 270 p602_d8_D2_H3K4me3/peaks.txt
#X 24939 p602_d8_D1_H3K4me3/peaks.txt
#X X 18867 p602_d8_D1_H3K4me3/peaks.txt|p602_d8_D2_H3K4me3/peaks.txt
# Using DESeq2 to calculate differential expression/enrichment...
# Output Stats input vs. target:
# Total Genes: 44076
# Total Up-regulated in target vs. input: 24203 (54.912%) [log2fold>1, FDR<0.05]
# Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]
#"-d 1000"
#p600_601_d12_D1_H3K4me3/peaks.txt p600_601_d9_D2_H3K4me3/peaks.txt Total Name
# X 14165 p600_601_d9_D2_H3K4me3/peaks.txt
#X 3910 p600_601_d12_D1_H3K4me3/peaks.txt
#X X 26162 p600_601_d12_D1_H3K4me3/peaks.txt|p600_601_d9_D2_H3K4me3/peaks.txt
#Using DESeq2 to calculate differential expression/enrichment...
# Output Stats input vs. target:
# Total Genes: 44237
# Total Up-regulated in target vs. input: 32706 (73.934%) [log2fold>1, FDR<0.05]
# Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]
#p602_d8_D1_H3K4me3/peaks.txt p602_d8_D2_H3K4me3/peaks.txt Total Name
# X 2825 p602_d8_D2_H3K4me3/peaks.txt
#X 29258 p602_d8_D1_H3K4me3/peaks.txt
#X X 20829 p602_d8_D1_H3K4me3/peaks.txt|p602_d8_D2_H3K4me3/peaks.txt
#Using DESeq2 to calculate differential expression/enrichment...
#Output Stats input vs. target:
# Total Genes: 52912
# Total Up-regulated in target vs. input: 32484 (61.393%) [log2fold>1, FDR<0.05]
# Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]
4.5. Draw plots
#grep "p600_601_d12_D1_H3K4me3/peaks.txt|p600_601_d9_D2_H3K4me3/peaks.txt" p600_601_d9d12_H3K4me3_macs2_peaks.txt | wc -l
#grep -v "p600_601_d12_D1_H3K4me3/peaks.txt|p600_601_d9_D2_H3K4me3/peaks.txt" p600_601_d9d12_H3K4me3_macs2_peaks.txt | grep "p600_601_d12_D1_H3K4me3/peaks.txt" | wc -l
#grep -v "p600_601_d12_D1_H3K4me3/peaks.txt|p600_601_d9_D2_H3K4me3/peaks.txt" p600_601_d9d12_H3K4me3_macs2_peaks.txt | grep "p600_601_d9_D2_H3K4me3/peaks.txt" | wc -l
#grep "p602_d8_D1_H3K4me3/peaks.txt|p602_d8_D2_H3K4me3/peaks.txt" p602_d8_H3K4me3_macs2_peaks.txt | wc -l
#grep -v "p602_d8_D1_H3K4me3/peaks.txt|p602_d8_D2_H3K4me3/peaks.txt" p602_d8_H3K4me3_macs2_peaks.txt | grep "p602_d8_D1_H3K4me3/peaks.txt" | wc -l
#grep -v "p602_d8_D1_H3K4me3/peaks.txt|p602_d8_D2_H3K4me3/peaks.txt" p602_d8_H3K4me3_macs2_peaks.txt | grep "p602_d8_D2_H3K4me3/peaks.txt" | wc -l
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
venn2(subsets=(1808, 5774, 25124), set_labels=('Donor 1', 'Donor 2'))
plt.title('Peaks between p600+p601 d9 or d12 H3K4me3 Donors')
plt.xlabel('Number of Elements')
plt.ylabel('')
plt.savefig('Peak_Consistency_Between_p600+p601_d9d12_H3K4me3_Donors.png', dpi=300, bbox_inches='tight')
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
venn2(subsets=(9100, 2761, 20623), set_labels=('Donor 1', 'Donor 2'))
plt.title('Peaks between p602 d8 H3K4me3 Donors')
plt.xlabel('Number of Elements')
plt.ylabel('')
plt.savefig('Peak_Consistency_Between_p602_d8_H3K4me3_Donors.png', dpi=300, bbox_inches='tight')
awk 'NR>1 {print $2 "\t" $3 "\t" $4 "\t" $1}' p600_601_d9d12_H3K4me3_macs2_peaks.txt > p600+p601_d9d12_H3K4me3_macs2_peaks.bed
awk 'NR>1 {print $2 "\t" $3 "\t" $4 "\t" $1}' p602_d8_H3K4me3_macs2_peaks.txt > p602_d8_H3K4me3_macs2_peaks.bed
~/Tools/csv2xls-0.4/csv_to_xls.py p600_601_d9d12_H3K4me3_macs2_peaks.txt -d$'\t' -o p600+p601_d9d12_H3K4me3_macs2_peaks.xls
~/Tools/csv2xls-0.4/csv_to_xls.py p602_d8_H3K4me3_macs2_peaks.txt -d$'\t' -o p602_d8_H3K4me3_macs2_peaks.xls
4.6. identifying statistically significantly differentially bound sites based on evidence of binding affinity (measured by differences in read densities)
##http://homer.ucsd.edu/homer/ngs/peaksReplicates.html
#getDifferentialPeaksReplicates.pl -t Oct4-r1/ Oct4-r2/ -b Sox2-r1/ Sox2-r2/ -i Input-r1/ Input-r2/ > outputPeaks.txt
##first get TSS regions using annotatePeaks.pl
#annotatePeaks.pl tss hg38 > tss.txt
#getDifferentialPeaksReplicates.pl -t H3K27ac-WNT-r1/ H3K27ac-WNT-r2/ -b H3K27ac-notx-r1 H3K27ac-notx-r2/ -p tss.txt > outputPeaks.txt
#- Donor I p602 (d8) vs p600+601 (d12) H3K4me3
#- Donor II p602 (d8) vs p600+601 (d9) H3K4me3
./getDifferentialPeaksReplicates_narrow.pl -t p602_d8_D1_H3K4me3 p602_d8_D2_H3K4me3 -b p600_601_d12_D1_H3K4me3 p600_601_d9_D2_H3K4me3 -genome hg38 -use peaks.txt -style histone > p602_d8_vs_p600_601_d9d12_H3K4me3_diff_peaks.txt
#./getDifferentialPeaksReplicates_narrow.pl -t p602_d8_D1_H3K4me3 -b p600_601_d12_D1_H3K4me3 -genome hg38 -use peaks.txt -style histone > p602_d8_vs_p600_601_d12_D1_H3K4me3_diff_peaks.txt
5.1. SICER peak calling (under env myperl)
# -- SICER --> bed --> annotatePeaks.pl
mkdir sicer; cd sicer;
ln -s ../results/picard/V_8_3_1_p600_601_d12_D1_H3K27me3.dedup.sorted.bed .
ln -s ../results/picard/V_8_3_2_p600_601_d9_D2_H3K27me3.dedup.sorted.bed .
ln -s ../results/picard/V_8_4_2_p602_d8_D1_H3K27me3.dedup.sorted.bed .
ln -s ../results/picard/V_8_4_1_p602_d8_D2_H3K27me3.dedup.sorted.bed .
ln -s ../results/picard/V_8_3_1_p600_601_d12_D1_input.dedup.sorted.bed .
ln -s ../results/picard/V_8_3_2_p600_601_d9_D2_input.dedup.sorted.bed .
ln -s ../results/picard/V_8_4_2_p602_d8_D1_input.dedup.sorted.bed .
ln -s ../results/picard/V_8_4_1_p602_d8_D2_input.dedup.sorted.bed .
#/home/jhuang/Tools/SICER1.1/SICER/SICER.sh [InputDir] [bed file] [control file] [OutputDir] [Species] [redundancy threshold] [window size (bp)] [fragment size] [effective genome fraction] [gap size (bp)] [FDR]
# 10000 is window size, 30000 is the gap size, 160 is the fragment size
mkdir p600_601_d12_D1 p600_601_d9_D2 p602_d8_D1 p602_d8_D2
~/Tools/SICER1.1/SICER/SICER.sh . V_8_3_1_p600_601_d12_D1_H3K27me3.dedup.sorted.bed V_8_3_1_p600_601_d12_D1_input.dedup.sorted.bed p600_601_d12_D1 hg38 1 10000 160 0.74 30000 0.01;
~/Tools/SICER1.1/SICER/SICER.sh . V_8_3_2_p600_601_d9_D2_H3K27me3.dedup.sorted.bed V_8_3_2_p600_601_d9_D2_input.dedup.sorted.bed p600_601_d9_D2 hg38 1 10000 160 0.74 30000 0.01;
~/Tools/SICER1.1/SICER/SICER.sh . V_8_4_2_p602_d8_D1_H3K27me3.dedup.sorted.bed V_8_4_2_p602_d8_D1_input.dedup.sorted.bed p602_d8_D1 hg38 1 10000 160 0.74 30000 0.01;
~/Tools/SICER1.1/SICER/SICER.sh . V_8_4_1_p602_d8_D2_H3K27me3.dedup.sorted.bed V_8_4_1_p602_d8_D2_input.dedup.sorted.bed p602_d8_D2 hg38 1 10000 160 0.74 30000 0.01;
#sicer -t treatment.bed -c control.bed -s hg38 -w 200 -rt 1 -f 150 -egf 0.74 -fdr 0.01 -g 600 -e 1000
#----------- to STEP 5.2-3. ----------->
#TODO: - check if the peak calling works well in IGV!
# - call peaks for H3K27me1, adjust the SICER-parameters!
# - transform them peaks.txt of HOMER, and call getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K27me3 p601_d8_D2_H3K27me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K27me3_sicer_peaks.txt
getDifferentialPeaksReplicates.pl -t p604_d8_D1_H3K27me3 p604_d8_D2_H3K27me3 -i p604_d8_D1_input p604_d8_D2_input -genome hg38 -use peaks.txt > p604_d8_H3K27me3_sicer_peaks.txt
#Note that histone using 'cat file1.bed file2.bed | sort -k1,1 -k2,2n | bedtools merge > merged.bed'
#Note that factor using HOMER getDifferentialPeaksReplicates from begining: "mergePeaks.pl --> DESeq recheck"!
mergePeaks sicer/p601_d8_D1/V_8_1_6_p601_d8_D1_H3K27me3.dedup.sorted-W10000-G30000-FDR0.01-island.bed sicer/p601_d8_D2/V_8_1_5_p601_d8_D2_H3K27me3.dedup.sorted-W10000-G30000-FDR0.01-island.bed -d 10000 > mergedPeaks1.txt
cat sicer/p601_d8_D1/V_8_1_6_p601_d8_D1_H3K27me3.dedup.sorted-W10000-G30000-FDR0.01-island.bed sicer/p601_d8_D2/V_8_1_5_p601_d8_D2_H3K27me3.dedup.sorted-W10000-G30000-FDR0.01-island.bed | sort -k1,1 -k2,2n | bedtools merge > merged_p601_d8.bed
#4957 (4388, 4822)
awk 'BEGIN{OFS="\t"; print "PeakID\tchr\tstart\tend\tstrand"} {print "Peak-"NR, $1, $2, $3, "."}' merged_p601_d8.bed > homer_peaks.bed
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!TODO!!!!!!!!!!!!!!!!!!!!!!!!: The bedtools merge command does not generate output in the HOMER peak format directly. However, you can use a combination of Unix command line tools (e.g., awk, sort) to convert bedtools output into HOMER format.
#bedtools merge -i input.bed > merged.bed
#awk 'BEGIN{OFS="\t"; print "PeakID\tchr\tstart\tend\tstrand"} {print "Peak-"NR, $1, $2, $3, "."}' merged.bed > homer_peaks.bed
5.2-3. Convert your SICER peaks to HOMER-compatible format, and replace them in homer/${sample}/peaks.txt. You can do this manually or with a script.
- The following awk command can be used to convert more MACS2 information into the HOMER format:
#mv homer/p601_d8_D1_H3K27me3/regions.txt homer/p601_d8_D1_H3K27me3/regions_raw.txt
#mv homer/p601_d8_D2_H3K27me3/regions.txt homer/p601_d8_D2_H3K27me3/regions_raw.txt
#mv homer/p604_d8_D1_H3K27me3/regions.txt homer/p604_d8_D1_H3K27me3/regions_raw.txt
#mv homer/p604_d8_D2_H3K27me3/regions.txt homer/p604_d8_D2_H3K27me3/regions_raw.txt
#NOTE that we should name the file as peaks.txt, since the Input-samples contain only peaks.txt.
awk 'BEGIN{OFS="\t"} {print $1"-"$2"-"$3,$1,$2,$3,"+",$4,0,0,$4,$5,0,$6,0}' sicer/p600_601_d12_D1/V_8_3_1_p600_601_d12_D1_H3K27me3.dedup.sorted-W10000-G30000-islands-summary-FDR0.01 > homer/p600_601_d12_D1_H3K27me3/peaks.txt
awk 'BEGIN{OFS="\t"} {print $1"-"$2"-"$3,$1,$2,$3,"+",$4,0,0,$4,$5,0,$6,0}' sicer/p600_601_d9_D2/V_8_3_2_p600_601_d9_D2_H3K27me3.dedup.sorted-W10000-G30000-islands-summary-FDR0.01 > homer/p600_601_d9_D2_H3K27me3/peaks.txt
awk 'BEGIN{OFS="\t"} {print $1"-"$2"-"$3,$1,$2,$3,"+",$4,0,0,$4,$5,0,$6,0}' sicer/p602_d8_D1/V_8_4_2_p602_d8_D1_H3K27me3.dedup.sorted-W10000-G30000-islands-summary-FDR0.01 > homer/p602_d8_D1_H3K27me3/peaks.txt
awk 'BEGIN{OFS="\t"} {print $1"-"$2"-"$3,$1,$2,$3,"+",$4,0,0,$4,$5,0,$6,0}' sicer/p602_d8_D2/V_8_4_1_p602_d8_D2_H3K27me3.dedup.sorted-W10000-G30000-islands-summary-FDR0.01 > homer/p602_d8_D2_H3K27me3/peaks.txt
#The format is similar to the following.
#chr1-83000-89999 chr1 83000 89999 + 21 0 0 21 27 0 5.623763510806874e-10 0 0 0
#chr1-859000-865999 chr1 859000 865999 + 23 0 0 23 28 0 3.501056563888787e-11 0 0 0
This command will:
* PeakID: Format it as "chr-start-end".
* chr: Same as SICER.
* start: Same as SICER.
* end: Same as SICER.
* strand: SICER doesn't provide strand information, we could fill with a default value '+'.
* Normalized Tag Count: Same as the normalized read count column in SICER.
* focus ratio: SICER doesn't provide this, we could fill with a default value '0'.
* findPeaks Score: Corresponds to Fold Change or FDR column in SICER.
* Total Tags: SICER doesn't provide this, we could fill with a default value '0'.
* Control Tags (normalized to IP Experiment): Same as Clonal reads or local background column in SICER.
* Fold Change vs Control: SICER doesn't provide this, we could fill with a default value '0'.
* p-value vs Control: Same as the p-value column in SICER.
* Fold Change vs Local: SICER doesn't provide this, we could fill with a default value '0'.
* p-value vs Local: SICER doesn't provide this, we could fill with a default value '0'.
* Clonal Fold Change: SICER doesn't provide this, we could fill with a default value '0'.
5.4. The program getDifferentialPeaksReplicates_broad.pl will essentially perform 3 steps, in the step 2 was modified.
conda activate myperl
cd homer
#diff /home/jhuang/anaconda3/envs/myperl/bin/getDifferentialPeaksReplicates.pl /home/jhuang/homer/bin/getDifferentialPeaksReplicates.pl
#vim /home/jhuang/anaconda3/envs/myperl/bin/getDifferentialPeaksReplicates.pl #max_distance_to_merge at line 203 to 133000, "-d <#>"
#-- Successful modification of the script getDifferentialPeaksReplicates.pl (Explain how to archieve getDifferentialPeaksReplicates.pl, only for DEBUG, for calculation directly run getDifferentialPeaksReplicates.pl!) --
#The -d parameter in the mergePeaks function in HOMER is used to specify the maximum distance between peak centers
#change Max distance to merge to 30000 bp in getDifferentialPeaksReplicates.pl
#mergePeaks -d 30000 [] temp_sorted | sort
#conda list homer #4.11
mergePeaks p600_601_d12_D1_H3K27me3/peaks.txt p600_601_d9_D2_H3K27me3/peaks.txt > mergePeaks_res.txt
python3 update_header.py
cat p600_601_d12_D1_H3K4me3/peaks.txt p600_601_d9_D2_H3K4me3/peaks.txt > merged_peaks.txt
awk '{print $2 "\t" $3 "\t" $4 "\t" $1}' merged_peaks.txt | sort -k1,1 -k2,2n | bedtools merge -d 1000 > bedtools_res.txt # get 36579 records using 'bedtools merge'
python3 adjust_mergePeaks_res.py # limiting the start- and end-positons of mergePeaks_res.txt to the ranges in bedtools_res.txt, last very long!
#check if the results are correct
cut -d$'\t' -f2-4 filtered_mergePeaks_res.txt > control1
diff control1 bedtools_res.txt
#NOTE that the script getDifferentialPeaksReplicates_broad.pl is modified in the step2; the script used "update_header.py" and "adjust_mergedPeaks_res.py"
#-d 35000
./getDifferentialPeaksReplicates_broad.pl -t p600_601_d12_D1_H3K27me3 p600_601_d9_D2_H3K27me3 -i p600_601_d12_D1_input p600_601_d9_D2_input -genome hg38 -use peaks.txt -style histone > p600_601_d9d12_H3K27me3_sicer_regions.txt
./getDifferentialPeaksReplicates_broad.pl -t p602_d8_D1_H3K27me3 p602_d8_D2_H3K27me3 -i p602_d8_D1_input p602_d8_D2_input -genome hg38 -use peaks.txt -style histone > p602_d8_H3K27me3_sicer_regions.txt
#p600_601_d12_D1_H3K27me3/peaks.txt p600_601_d9_D2_H3K27me3/peaks.txt Total Name
# X 1821 p600_601_d9_D2_H3K27me3/peaks.txt
#X 1306 p600_601_d12_D1_H3K27me3/peaks.txt
#X X 3981 p600_601_d12_D1_H3K27me3/peaks.txt|p600_601_d9_D2_H3K27me3/peaks.txt
#Using DESeq2 to calculate differential expression/enrichment...
#Output Stats input vs. target:
# Total Genes: 4431
# Total Up-regulated in target vs. input: 2502 (56.466%) [log2fold>1, FDR<0.05]
# Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]
#
#p602_d8_D1_H3K27me3/peaks.txt p602_d8_D2_H3K27me3/peaks.txt Total Name
# X 1101 p602_d8_D2_H3K27me3/peaks.txt
#X 1602 p602_d8_D1_H3K27me3/peaks.txt
#X X 3668 p602_d8_D1_H3K27me3/peaks.txt|p602_d8_D2_H3K27me3/peaks.txt
#Using DESeq2 to calculate differential expression/enrichment...
#Output Stats input vs. target:
# Total Genes: 4216
# Total Up-regulated in target vs. input: 2113 (50.119%) [log2fold>1, FDR<0.05]
# Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]
5.5. Draw plots
grep "p600_601_d12_D1_H3K27me3/peaks.txt|p600_601_d9_D2_H3K27me3/peaks.txt" p600_601_d9d12_H3K27me3_sicer_regions.txt | wc -l # 2128
grep "p600_601_d12_D1_H3K27me3/peaks.txt" p600_601_d9d12_H3K27me3_sicer_regions.txt | wc -l # 2162 - 2128 = 34
grep "p600_601_d9_D2_H3K27me3/peaks.txt" p600_601_d9d12_H3K27me3_sicer_regions.txt | wc -l # 2468 - 2128 = 340
# 340+34+2128 = 2502
grep "p602_d8_D1_H3K27me3/peaks.txt|p602_d8_D2_H3K27me3/peaks.txt" p602_d8_H3K27me3_sicer_regions.txt | wc -l # 1840
grep "p602_d8_D1_H3K27me3/peaks.txt" p602_d8_H3K27me3_sicer_regions.txt | wc -l # 1940 - 1840 = 100
grep "p602_d8_D2_H3K27me3/peaks.txt" p602_d8_H3K27me3_sicer_regions.txt | wc -l # 2013 - 1840 = 173
# 1840 + 100 + 173 = 2113
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
venn2(subsets=(34, 340, 2128), set_labels=('Donor 1', 'Donor 2'))
plt.title('Regions between p602 d8 H3K27me3 Donors')
plt.xlabel('Number of Elements')
plt.ylabel('')
plt.savefig('Region_Consistency_Between_p600+p601_d9d12_H3K27me3_Donors.png', dpi=300, bbox_inches='tight')
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
venn2(subsets=(100, 173, 1840), set_labels=('Donor 1', 'Donor 2'))
plt.title('Regions between p602 d8 H3K27me3 Donors')
plt.xlabel('Number of Elements')
plt.ylabel('')
plt.savefig('Region_Consistency_Between_p602_d8_H3K27me3_Donors.png', dpi=300, bbox_inches='tight')
awk 'NR>1 {print $2 "\t" $3 "\t" $4 "\t" $1}' p600_601_d9d12_H3K27me3_sicer_regions.txt > p600+p601_d9d12_H3K27me3_sicer_regions.bed
awk 'NR>1 {print $2 "\t" $3 "\t" $4 "\t" $1}' p602_d8_H3K27me3_sicer_regions.txt > p602_d8_H3K27me3_sicer_regions.bed
~/Tools/csv2xls-0.4/csv_to_xls.py p600_601_d9d12_H3K27me3_sicer_regions.txt -d$'\t' -o p600+p601_d9d12_H3K27me3_sicer_regions.xls
~/Tools/csv2xls-0.4/csv_to_xls.py p602_d8_H3K27me3_sicer_regions.txt -d$'\t' -o p602_d8_H3K27me3_sicer_regions.xls
5.6. identifying statistically significantly differentially bound sites based on evidence of binding affinity (measured by differences in read densities)
#- Donor I p602 (d8) vs p600+601 (d12) H3K27me3
#- Donor II p602 (d8) vs p600+601 (d9) H3K27me3
./getDifferentialPeaksReplicates_broad.pl -t p602_d8_D1_H3K27me3 p602_d8_D2_H3K27me3 -b p600_601_d12_D1_H3K27me3 p600_601_d9_D2_H3K27me3 -genome hg38 -use peaks.txt -style histone > p602_d8_vs_p600_601_d9d12_H3K27me3_regions.txt
7.0. LOGs: result examples using different -d values
# -- Max distance to merge: 35000 bp --
getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K4me3_macs2_peaks_using_mergePeaks_d35000.txt
Merging peaks...
Comparing p601_d8_D1_H3K4me3/peaks.txt (32453 total) and p601_d8_D1_H3K4me3/peaks.txt (32453 total)
Comparing p601_d8_D1_H3K4me3/peaks.txt (32453 total) and p601_d8_D2_H3K4me3/peaks.txt (37269 total)
Comparing p601_d8_D2_H3K4me3/peaks.txt (37269 total) and p601_d8_D1_H3K4me3/peaks.txt (32453 total)
Comparing p601_d8_D2_H3K4me3/peaks.txt (37269 total) and p601_d8_D2_H3K4me3/peaks.txt (37269 total)
p601_d8_D1_H3K4me3/peaks.txt p601_d8_D2_H3K4me3/peaks.txt Total Name
X 2830 p601_d8_D2_H3K4me3/peaks.txt
X 714 p601_d8_D1_H3K4me3/peaks.txt
X X 12786 p601_d8_D1_H3K4me3/peaks.txt|p601_d8_D2_H3K4me3/peaks.txt
Step2: Quantifying reads across target/background/input tag directories
Peak File Statistics:
Total Peaks: 16324
Step3: Calling R for differential enrichment statistics (-DESeq2)
Autodetecting input file format...
Autodetected annotatePeaks.pl file
Performing variance stabalization (rlog)...
Using DESeq2 to calculate differential expression/enrichment...
Output Stats input vs. target:
Total Genes: 16324
Total Up-regulated in target vs. input: 7193 (44.064%) [log2fold>1, FDR<0.05]
Total Dn-regulated in target vs. input: 1319 (8.080%) [log2fold<-1, FDR<0.05]
# -- Max distance to merge: 1000 bp --
getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K4me3_macs2_peaks_d1000.txt
getDifferentialPeaksReplicates.pl -t p604_d8_D1_H3K4me3 p604_d8_D2_H3K4me3 -i p604_d8_D1_input p604_d8_D2_input -genome hg38 -use peaks.txt > p604_d8_H3K4me3_macs2_peaks_d1000.txt
Step1: Defining Putative Peak Set
p601_d8_D1_H3K4me3/peaks.txt p601_d8_D2_H3K4me3/peaks.txt Total Name
X 10726 p601_d8_D2_H3K4me3/peaks.txt
X 4883 p601_d8_D1_H3K4me3/peaks.txt
X X 23478 p601_d8_D1_H3K4me3/peaks.txt|p601_d8_D2_H3K4me3/peaks.txt
Step3: Calling R for differential enrichment statistics (-DESeq2)
Using DESeq2 to calculate differential expression/enrichment...
Output Stats input vs. target:
Total Genes: 39087
Total Up-regulated in target vs. input: 28762 (73.585%) [log2fold>1, FDR<0.05]
Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]
p604_d8_D1_H3K4me3/peaks.txt p604_d8_D2_H3K4me3/peaks.txt Total Name
X 9806 p604_d8_D2_H3K4me3/peaks.txt
X 4611 p604_d8_D1_H3K4me3/peaks.txt
X X 19586 p604_d8_D1_H3K4me3/peaks.txt|p604_d8_D2_H3K4me3/peaks.txt
Using DESeq2 to calculate differential expression/enrichment...
Output Stats input vs. target:
Total Genes: 34003
Total Up-regulated in target vs. input: 25135 (73.920%) [log2fold>1, FDR<0.05]
Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]
# -- Max distance to merge: 500 bp --
vim /home/jhuang/anaconda3/envs/myperl/bin/getDifferentialPeaksReplicates.pl
getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K4me3_macs2_peaks_d500.txt
getDifferentialPeaksReplicates.pl -t p604_d8_D1_H3K4me3 p604_d8_D2_H3K4me3 -i p604_d8_D1_input p604_d8_D2_input -genome hg38 -use peaks.txt > p604_d8_H3K4me3_macs2_peaks_d500.txt
p601_d8_D1_H3K4me3/peaks.txt p601_d8_D2_H3K4me3/peaks.txt Total Name
X 13537 p601_d8_D2_H3K4me3/peaks.txt
X 8455 p601_d8_D1_H3K4me3/peaks.txt
X X 23009 p601_d8_D1_H3K4me3/peaks.txt|p601_d8_D2_H3K4me3/peaks.txt
Using DESeq2 to calculate differential expression/enrichment...
Output Stats input vs. target:
Total Genes: 45001
Total Up-regulated in target vs. input: 31206 (69.345%) [log2fold>1, FDR<0.05]
Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]
p604_d8_D1_H3K4me3/peaks.txt p604_d8_D2_H3K4me3/peaks.txt Total Name
X 12775 p604_d8_D2_H3K4me3/peaks.txt
X 8797 p604_d8_D1_H3K4me3/peaks.txt
X X 18654 p604_d8_D1_H3K4me3/peaks.txt|p604_d8_D2_H3K4me3/peaks.txt
Using DESeq2 to calculate differential expression/enrichment...
Output Stats input vs. target:
Total Genes: 40226
Total Up-regulated in target vs. input: 27939 (69.455%) [log2fold>1, FDR<0.05]
Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]
cp p601_d8_H3K4me3_macs2_peaks_d500.txt p601_d8_H3K4me3_macs2_peaks.txt
cp p604_d8_H3K4me3_macs2_peaks_d500.txt p604_d8_H3K4me3_macs2_peaks.txt
awk 'NR>1 {print $2 "\t" $3 "\t" $4 "\t" $1}' p601_d8_H3K4me3_macs2_peaks.txt > p601_d8_H3K4me3_macs2_peaks.bed
awk 'NR>1 {print $2 "\t" $3 "\t" $4 "\t" $1}' p604_d8_H3K4me3_macs2_peaks.txt > p604_d8_H3K4me3_macs2_peaks.bed
~/Tools/csv2xls-0.4/csv_to_xls.py p601_d8_H3K4me3_macs2_peaks.txt -d$'\t' -o p601_d8_H3K4me3_macs2_peaks.xls
~/Tools/csv2xls-0.4/csv_to_xls.py p604_d8_H3K4me3_macs2_peaks.txt -d$'\t' -o p604_d8_H3K4me3_macs2_peaks.xls
#----> Send to Denise!
# -- Max distance to merge: 100 bp --
getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K4me3_macs2_peaks_d100.txt
getDifferentialPeaksReplicates.pl -t p604_d8_D1_H3K4me3 p604_d8_D2_H3K4me3 -i p604_d8_D1_input p604_d8_D2_input -genome hg38 -use peaks.txt > p604_d8_H3K4me3_macs2_peaks_d100.txt
p601_d8_D1_H3K4me3/peaks.txt p601_d8_D2_H3K4me3/peaks.txt Total Name
X 25160 p601_d8_D2_H3K4me3/peaks.txt
X 20344 p601_d8_D1_H3K4me3/peaks.txt
X X 12109 p601_d8_D1_H3K4me3/peaks.txt|p601_d8_D2_H3K4me3/peaks.txt
Using DESeq2 to calculate differential expression/enrichment...
Output Stats input vs. target:
Total Genes: 57613
Total Up-regulated in target vs. input: 31179 (54.118%) [log2fold>1, FDR<0.05]
Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]
p604_d8_D1_H3K4me3/peaks.txt p604_d8_D2_H3K4me3/peaks.txt Total Name
X 23330 p604_d8_D2_H3K4me3/peaks.txt
X 19855 p604_d8_D1_H3K4me3/peaks.txt
X X 8512 p604_d8_D1_H3K4me3/peaks.txt|p604_d8_D2_H3K4me3/peaks.txt
Using DESeq2 to calculate differential expression/enrichment...
Output Stats input vs. target:
Total Genes: 51697
Total Up-regulated in target vs. input: 28436 (55.005%) [log2fold>1, FDR<0.05]
Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05]
cp ./homer/p601_d8_D1_input/p601_d8_D1_input.ucsc.bigWig bigWigs
cp ./homer/p601_d8_D2_input/p601_d8_D2_input.ucsc.bigWig bigWigs
cp ./homer/p604_d8_D1_input/p604_d8_D1_input.ucsc.bigWig bigWigs
cp ./homer/p604_d8_D2_input/p604_d8_D2_input.ucsc.bigWig bigWigs
cp ./homer/p601_d8_D1_H3K4me3/p601_d8_D1_H3K4me3.ucsc.bigWig bigWigs
cp ./homer/p601_d8_D2_H3K4me3/p601_d8_D2_H3K4me3.ucsc.bigWig bigWigs
cp ./homer/p604_d8_D1_H3K4me3/p604_d8_D1_H3K4me3.ucsc.bigWig bigWigs
cp ./homer/p604_d8_D2_H3K4me3/p604_d8_D2_H3K4me3.ucsc.bigWig bigWigs
cp ./homer/p601_d8_D1_H3K27me3/p601_d8_D1_H3K27me3.ucsc.bigWig bigWigs
cp ./homer/p601_d8_D2_H3K27me3/p601_d8_D2_H3K27me3.ucsc.bigWig bigWigs
cp ./homer/p604_d8_D1_H3K27me3/p604_d8_D1_H3K27me3.ucsc.bigWig bigWigs
cp ./homer/p604_d8_D2_H3K27me3/p604_d8_D2_H3K27me3.ucsc.bigWig bigWigs
#----> Send to Denise!
cp ./homer/p601_d8_D1_H3K27ac/p601_d8_D1_H3K27ac.ucsc.bigWig bigWigs
cp ./homer/p601_d8_D2_H3K27ac/p601_d8_D2_H3K27ac.ucsc.bigWig bigWigs
cp ./homer/p604_d8_D1_H3K27ac/p604_d8_D1_H3K27ac.ucsc.bigWig bigWigs
cp ./homer/p604_d8_D2_H3K27ac/p604_d8_D2_H3K27ac.ucsc.bigWig bigWigs
getDifferentialPeaksReplicates_broad.pl (code)
#!/usr/bin/env perl
use warnings;
use lib "/home/jhuang/Tools/homer/bin";
my $homeDir = "/home/jhuang/Tools/homer/";
my $foldThresh = 2;
my $fdrThresh = 0.05;
my $peakFoldInput = 2;
my $peakFdrInput = 0.001;
my $style = "";
sub printCMD {
print STDERR "\n\tUsage: getDifferentialPeaksReplicates.pl [options] -t <IP tagdir1> [IP tagdir2] ...\n";
print STDERR "\t -b <background tagdir1> [background tagdir2] ...\n";
print STDERR "\t -i <Input tagdir1> [Input tagdir2] ...\n";
print STDERR "\t\tNote: if input is provided, peaks will be called.\n";
print STDERR "\n\tOptions:\n";
#print STDERR "\t\t-F <#> (fold enrichment over bg, default: $foldThresh)\n";
#print STDERR "\t\t-fdr <#> (FDR over input, default: $fdrThresh)\n";
print STDERR "\t\t-f <#> (fold enrichment over bg, default: $foldThresh)\n";
print STDERR "\t\t-q <#> (FDR over bg, default: $fdrThresh)\n";
print STDERR "\t\t-fdr <#>, -F <#>, -L <#> (parameters for findPeaks)\n";
print STDERR "\t\t-genome <genome version> (genome version to use for annotation)\n";
print STDERR "\t\t-DESeq2 | -DESeq | -edgeR (differential stats algorithm, default: DESeq2)\n";
print STDERR "\t\t-balanced (normalize signal across peaks, default: normalize to read totals)\n";
print STDERR "\t\t-fragLength <#> (standardize estimated fragment length across analysis)\n";
print STDERR "\t\t-all (report all peaks, not just differentially regulated)\n";
print STDERR "\n\tPeak finding directives:\n";
print STDERR "\t\t-style <factor|histone|tss> (findPeaks style to use for finding peaks)\n";
print STDERR "\t\t-use <peaks.txt|regions.txt|tss.txt> (use existing peaks in tag directories)\n";
print STDERR "\t\t-p <peak file> (use specific peak file instead of tagDir/peaks.txt or finding new one)\n";
print STDERR "\t\tOther options will be passed to findPeaks\n";
print STDERR "\n";
exit;
}
my @targets = ();
my @background = ();
my @inputs = ();
my $findPeaksOpts = "";
my $use = "";
my $givenPeakFile = '';
my $norm2total = "-norm2total";
my $diffAlg = "-DESeq2";
my $genome = 'none';
my $annOptions = "";
my $fragLength = '';
my $allFlag = 0;
my $ogCmd = "getDifferentialPeaksReplicates.pl";
for (my $i=0;$i<@ARGV;$i++) {
$ogCmd .= " " . $ARGV[$i];
}
for (my $i=0;$i<@ARGV;$i++) {
if ($ARGV[$i] eq '-t') {
$i++;
while ($i < @ARGV) {
if ($ARGV[$i] =~ /^-/) {
$i--;
last;
}
push(@targets, $ARGV[$i++]);
}
} elsif ($ARGV[$i] eq '-i') {
$i++;
while ($i < @ARGV) {
if ($ARGV[$i] =~ /^-/) {
$i--;
last;
}
push(@inputs, $ARGV[$i++]);
}
} elsif ($ARGV[$i] eq '-b') {
$i++;
while ($i < @ARGV) {
if ($ARGV[$i] =~ /^-/) {
$i--;
last;
}
push(@background, $ARGV[$i++]);
}
} elsif ($ARGV[$i] eq '-f') {
$foldThresh = $ARGV[++$i];
} elsif ($ARGV[$i] eq '-genome') {
$genome = $ARGV[++$i];
} elsif ($ARGV[$i] eq '-q') {
$fdrThresh = $ARGV[++$i];
} elsif ($ARGV[$i] eq '-use') {
$use = $ARGV[++$i];
if ($use eq 'tss.txt') {
$annOptions .= " -fragLength 1 -strand +";
$fragLength = " -fragLength 1" if ($fragLength eq '');
}
} elsif ($ARGV[$i] eq '-p') {
$givenPeakFile = $ARGV[++$i];
} elsif ($ARGV[$i] eq '-style') {
$style = $ARGV[++$i];
if ($style eq 'tss') {
$annOptions .= " -fragLength 1 -strand +";
$fragLength = " -fragLength 1" if ($fragLength eq '');
}
} elsif ($ARGV[$i] eq '-edgeR') {
$diffAlg = $ARGV[$i];
} elsif ($ARGV[$i] eq '-fragLength') {
$fragLength = " -fragLength $ARGV[++$i]";
} elsif ($ARGV[$i] eq '-DESeq2') {
$diffAlg = $ARGV[$i];
} elsif ($ARGV[$i] eq '-all') {
$allFlag = 1;
} elsif ($ARGV[$i] eq '-DESeq') {
$diffAlg = $ARGV[$i];
} elsif ($ARGV[$i] eq '-balanced') {
$norm2total = "";
} elsif ($ARGV[$i] eq '-h' || $ARGV[$i] eq '--help' || $ARGV[$i] eq '--') {
printCMD();
} else {
$findPeaksOpts .= " " . $ARGV[$i];
#print STDERR "!!! \"$ARGV[$i]\" not recognized\n";
#printCMD();
}
}
my $rand = rand();
my %toDelete = ();
if ($diffAlg eq '-edgeR' && $norm2total ne '') {
print STDERR "!!! Error, -edgeR requires \"-balanced\" to work correctly!!!\n";
exit;
}
$log2Thresh = log($foldThresh)/log(2);
if (@targets < 1) {
print STDERR "!!! Error, need at least one target directory!!!\n";
printCMD();
}
if (scalar(@inputs) + scalar(@background) < 1) {
print STDERR "\t!!! Error: program requires either input or background experiments to perform differential calculations!\n";
exit;
}
my $targetDirs = '';
my $targetStr = "";
foreach(@targets) {
$targetDirs .= " \"$_\"";
$targetStr .= " target";
}
my $inputDirs = '';
my $inputStr = "";
foreach(@inputs) {
$inputDirs .= " \"$_\"";
$inputStr .= " input";
}
my $bgDirs = '';
my $bgStr = "";
foreach(@background) {
$bgDirs .= " \"$_\"";
$bgStr .= " bg";
}
if ($givenPeakFile eq '') {
$peakFile = $rand . ".peaks";
$toDelete{$peakFile}=1;
} else {
$peakFile = $givenPeakFile;
}
if ($findPeaksOpts ne '') {
print STDERR "\tUsing the following extra parameters for findPeaks:\n\t\t$findPeaksOpts\n";
}
print STDERR "\tStep1: Defining Putative Peak Set\n";
if ($use eq '' && $givenPeakFile eq '') {
print STDERR "\t\tFinding peaks in merged meta-experiment from target tag directories\n";
my $targetDir = $rand . ".targetTagDir";
`makeTagDirectory \"$targetDir\" -d $targetDirs $fragLength`;
$toDelete{$targetDir}=1;
my $inputDir = $rand . ".inputTagDir";
my $cmd = "findPeaks \"$targetDir\"";
if ($style eq '') {
$style = 'factor';
print STDERR "\tUsing -style $style...\n";
}
$cmd .= " -style $style";
$cmd .= $fragLength . " " . $findPeaksOpts;
if (@inputs > 0) {
`makeTagDirectory \"$inputDir\" -d $inputDirs $fragLength`;
$toDelete{$inputDir}=1;
$cmd .= " -i \"$inputDir\"";
}
#print STDERR "`$cmd > \"$peakFile\"`\n";
`$cmd > \"$peakFile\"`;
} elsif ($givenPeakFile eq '' && $use ne '') {
my $files = "";
my @allDirs = ();
push(@allDirs, @targets, @inputs, @background);
print STDERR "\t\tUsing existing peak files for features:\n";
foreach(@allDirs) {
my $p = $_ . "/" . $use;
if (-e $p) {
print STDERR "\t\t\t$p\n";
$files .= " \"$p\"";
}
}
#MODIFIED
#`mergePeaks $files > \"$peakFile\"`;
#print "$files\n";
#print "\"$peakFile\"\n";
`mergePeaks $files -d 35000 > mergePeaks_res.txt`;
`python3 update_header.py`;
`cat $files > merged_peaks.txt`;
## Split the list of files into an array
#my @files = split / /, $files;
# Open the output file
#open(my $out, '>', 'merged_peaks.txt') or die "Cannot open merged_peaks.txt: $!";
#foreach my $file (@files) {
# print("xxxxx$file\n");
# open(my $in, '<', $file) or die "Cannot open $file: $!";
# while (<$in>) {
# s/\t+$//; # removes trailing tabs
# print $out "$_\n";
# }
# close $in;
#}
#close $out;
system("awk '{print \$2 \"\\t\" \$3 \"\\t\" \$4 \"\\t\" \$1}' merged_peaks.txt | sort -k1,1 -k2,2n | bedtools merge -d 35000 > bedtools_res.txt");
`python3 adjust_mergePeaks_res.py`;
`mv filtered_mergePeaks_res.txt \"$peakFile\"`;
}
$rawFile= $rand . ".raw.txt";
$diffFile= $rand . ".diff.txt";
$upFile = $rand . ".Up_target_vs_bg.txt";
$downFile = $rand . ".Down_target_vs_bg.txt";
if ($bgDirs eq '') {
$bgDirs = $inputDirs;
$bgStr = $inputStr;
@background = @inputs;
$upFile = $rand . ".Up_target_vs_input.txt";
$downFile = $rand . ".Down_target_vs_input.txt";
}
print STDERR "\n\tStep2: Quantifying reads across target/background/input tag directories\n\n";
#print STDERR "`annotatePeaks.pl \"$peakFile\" none -d $bgDirs $targetDirs -raw > \"$rawFile\"`;\n";
`annotatePeaks.pl \"$peakFile\" $genome -d $bgDirs $targetDirs -raw $annOptions $fragLength > \"$rawFile\"`;
#print STDERR "`getDiffExpression.pl \"$rawFile\" $bgStr $targetStr $norm2total $diffAlg -fdr $fdrThresh -log2fold $log2Thresh -export $rand > $diffFile`;\n";
#
print STDERR "\n\tStep3: Calling R for differential enrichment statistics ($diffAlg)\n\n";
`getDiffExpression.pl \"$rawFile\" $bgStr $targetStr $norm2total $diffAlg -fdr $fdrThresh -log2fold $log2Thresh -export $rand > $diffFile`;
$toDelete{$rawFile}=1;
$toDelete{$diffFile}=1;
$toDelete{$upFile}=1;
$toDelete{$downFile}=1;
print "#cmd=$ogCmd|";
my $ofile = $upFile;
$ofile = $diffFile if ($allFlag);
open IN, $ofile;
while (<IN>) {
print $_;
}
close IN;
foreach(keys %toDelete) {
next if ($_ eq '/');
#print STDERR "\trm -r \"$_\"\n";
`rm -r "$_"`;
}
getDifferentialPeaksReplicates_narrow.pl (code)
#!/usr/bin/env perl
use warnings;
use lib "/home/jhuang/anaconda3/envs/myperl/share/homer/.//bin";
my $homeDir = "/home/jhuang/anaconda3/envs/myperl/share/homer/./";
my $foldThresh = 2;
my $fdrThresh = 0.05;
my $peakFoldInput = 2;
my $peakFdrInput = 0.001;
my $style = "";
sub printCMD {
print STDERR "\n\tUsage: getDifferentialPeaksReplicates.pl [options] -t <IP tagdir1> [IP tagdir2] ...\n";
print STDERR "\t -b <background tagdir1> [background tagdir2] ...\n";
print STDERR "\t -i <Input tagdir1> [Input tagdir2] ...\n";
print STDERR "\t\tNote: if input is provided, peaks will be called.\n";
print STDERR "\n\tOptions:\n";
#print STDERR "\t\t-F <#> (fold enrichment over bg, default: $foldThresh)\n";
#print STDERR "\t\t-fdr <#> (FDR over input, default: $fdrThresh)\n";
print STDERR "\t\t-f <#> (fold enrichment over bg, default: $foldThresh)\n";
print STDERR "\t\t-q <#> (FDR over bg, default: $fdrThresh)\n";
print STDERR "\t\t-fdr <#>, -F <#>, -L <#> (parameters for findPeaks)\n";
print STDERR "\t\t-genome <genome version> (genome version to use for annotation)\n";
print STDERR "\t\t-DESeq2 | -DESeq | -edgeR (differential stats algorithm, default: DESeq2)\n";
print STDERR "\t\t-balanced (normalize signal across peaks, default: normalize to read totals)\n";
print STDERR "\t\t-fragLength <#> (standardize estimated fragment length across analysis)\n";
print STDERR "\t\t-all (report all peaks, not just differentially regulated)\n";
print STDERR "\n\tPeak finding directives:\n";
print STDERR "\t\t-style <factor|histone|tss> (findPeaks style to use for finding peaks)\n";
print STDERR "\t\t-use <peaks.txt|regions.txt|tss.txt> (use existing peaks in tag directories)\n";
print STDERR "\t\t-p <peak file> (use specific peak file instead of tagDir/peaks.txt or finding new one)\n";
print STDERR "\t\tOther options will be passed to findPeaks\n";
print STDERR "\n";
exit;
}
my @targets = ();
my @background = ();
my @inputs = ();
my $findPeaksOpts = "";
my $use = "";
my $givenPeakFile = '';
my $norm2total = "-norm2total";
my $diffAlg = "-DESeq2";
my $genome = 'none';
my $annOptions = "";
my $fragLength = '';
my $allFlag = 0;
my $ogCmd = "getDifferentialPeaksReplicates.pl";
for (my $i=0;$i<@ARGV;$i++) {
$ogCmd .= " " . $ARGV[$i];
}
for (my $i=0;$i<@ARGV;$i++) {
if ($ARGV[$i] eq '-t') {
$i++;
while ($i < @ARGV) {
if ($ARGV[$i] =~ /^-/) {
$i--;
last;
}
push(@targets, $ARGV[$i++]);
}
} elsif ($ARGV[$i] eq '-i') {
$i++;
while ($i < @ARGV) {
if ($ARGV[$i] =~ /^-/) {
$i--;
last;
}
push(@inputs, $ARGV[$i++]);
}
} elsif ($ARGV[$i] eq '-b') {
$i++;
while ($i < @ARGV) {
if ($ARGV[$i] =~ /^-/) {
$i--;
last;
}
push(@background, $ARGV[$i++]);
}
} elsif ($ARGV[$i] eq '-f') {
$foldThresh = $ARGV[++$i];
} elsif ($ARGV[$i] eq '-genome') {
$genome = $ARGV[++$i];
} elsif ($ARGV[$i] eq '-q') {
$fdrThresh = $ARGV[++$i];
} elsif ($ARGV[$i] eq '-use') {
$use = $ARGV[++$i];
if ($use eq 'tss.txt') {
$annOptions .= " -fragLength 1 -strand +";
$fragLength = " -fragLength 1" if ($fragLength eq '');
}
} elsif ($ARGV[$i] eq '-p') {
$givenPeakFile = $ARGV[++$i];
} elsif ($ARGV[$i] eq '-style') {
$style = $ARGV[++$i];
if ($style eq 'tss') {
$annOptions .= " -fragLength 1 -strand +";
$fragLength = " -fragLength 1" if ($fragLength eq '');
}
} elsif ($ARGV[$i] eq '-edgeR') {
$diffAlg = $ARGV[$i];
} elsif ($ARGV[$i] eq '-fragLength') {
$fragLength = " -fragLength $ARGV[++$i]";
} elsif ($ARGV[$i] eq '-DESeq2') {
$diffAlg = $ARGV[$i];
} elsif ($ARGV[$i] eq '-all') {
$allFlag = 1;
} elsif ($ARGV[$i] eq '-DESeq') {
$diffAlg = $ARGV[$i];
} elsif ($ARGV[$i] eq '-balanced') {
$norm2total = "";
} elsif ($ARGV[$i] eq '-h' || $ARGV[$i] eq '--help' || $ARGV[$i] eq '--') {
printCMD();
} else {
$findPeaksOpts .= " " . $ARGV[$i];
#print STDERR "!!! \"$ARGV[$i]\" not recognized\n";
#printCMD();
}
}
my $rand = rand();
my %toDelete = ();
if ($diffAlg eq '-edgeR' && $norm2total ne '') {
print STDERR "!!! Error, -edgeR requires \"-balanced\" to work correctly!!!\n";
exit;
}
$log2Thresh = log($foldThresh)/log(2);
if (@targets < 1) {
print STDERR "!!! Error, need at least one target directory!!!\n";
printCMD();
}
if (scalar(@inputs) + scalar(@background) < 1) {
print STDERR "\t!!! Error: program requires either input or background experiments to perform differential calculations!\n";
exit;
}
my $targetDirs = '';
my $targetStr = "";
foreach(@targets) {
$targetDirs .= " \"$_\"";
$targetStr .= " target";
}
my $inputDirs = '';
my $inputStr = "";
foreach(@inputs) {
$inputDirs .= " \"$_\"";
$inputStr .= " input";
}
my $bgDirs = '';
my $bgStr = "";
foreach(@background) {
$bgDirs .= " \"$_\"";
$bgStr .= " bg";
}
if ($givenPeakFile eq '') {
$peakFile = $rand . ".peaks";
$toDelete{$peakFile}=1;
} else {
$peakFile = $givenPeakFile;
}
if ($findPeaksOpts ne '') {
print STDERR "\tUsing the following extra parameters for findPeaks:\n\t\t$findPeaksOpts\n";
}
print STDERR "\tStep1: Defining Putative Peak Set\n";
if ($use eq '' && $givenPeakFile eq '') {
print STDERR "\t\tFinding peaks in merged meta-experiment from target tag directories\n";
my $targetDir = $rand . ".targetTagDir";
`makeTagDirectory \"$targetDir\" -d $targetDirs $fragLength`;
$toDelete{$targetDir}=1;
my $inputDir = $rand . ".inputTagDir";
my $cmd = "findPeaks \"$targetDir\"";
if ($style eq '') {
$style = 'factor';
print STDERR "\tUsing -style $style...\n";
}
$cmd .= " -style $style";
$cmd .= $fragLength . " " . $findPeaksOpts;
if (@inputs > 0) {
`makeTagDirectory \"$inputDir\" -d $inputDirs $fragLength`;
$toDelete{$inputDir}=1;
$cmd .= " -i \"$inputDir\"";
}
#print STDERR "`$cmd > \"$peakFile\"`\n";
`$cmd > \"$peakFile\"`;
} elsif ($givenPeakFile eq '' && $use ne '') {
my $files = "";
my @allDirs = ();
push(@allDirs, @targets, @inputs, @background);
print STDERR "\t\tUsing existing peak files for features:\n";
foreach(@allDirs) {
my $p = $_ . "/" . $use;
if (-e $p) {
print STDERR "\t\t\t$p\n";
$files .= " \"$p\"";
}
}
`mergePeaks -d 2000 $files > \"$peakFile\"`;
#`cat $files | sort -k1,1 -k2,2n | bedtools merge > \"$peakFile\"`;
}
$rawFile= $rand . ".raw.txt";
$diffFile= $rand . ".diff.txt";
$upFile = $rand . ".Up_target_vs_bg.txt";
$downFile = $rand . ".Down_target_vs_bg.txt";
if ($bgDirs eq '') {
$bgDirs = $inputDirs;
$bgStr = $inputStr;
@background = @inputs;
$upFile = $rand . ".Up_target_vs_input.txt";
$downFile = $rand . ".Down_target_vs_input.txt";
}
print STDERR "\n\tStep2: Quantifying reads across target/background/input tag directories\n\n";
#print STDERR "`annotatePeaks.pl \"$peakFile\" none -d $bgDirs $targetDirs -raw > \"$rawFile\"`;\n";
`annotatePeaks.pl \"$peakFile\" $genome -d $bgDirs $targetDirs -raw $annOptions $fragLength > \"$rawFile\"`;
#print STDERR "`getDiffExpression.pl \"$rawFile\" $bgStr $targetStr $norm2total $diffAlg -fdr $fdrThresh -log2fold $log2Thresh -export $rand > $diffFile`;\n";
#
print STDERR "\n\tStep3: Calling R for differential enrichment statistics ($diffAlg)\n\n";
`getDiffExpression.pl \"$rawFile\" $bgStr $targetStr $norm2total $diffAlg -fdr $fdrThresh -log2fold $log2Thresh -export $rand > $diffFile`;
$toDelete{$rawFile}=1;
$toDelete{$diffFile}=1;
$toDelete{$upFile}=1;
$toDelete{$downFile}=1;
print "#cmd=$ogCmd|";
my $ofile = $upFile;
$ofile = $diffFile if ($allFlag);
open IN, $ofile;
while (<IN>) {
print $_;
}
close IN;
#foreach(keys %toDelete) {
# next if ($_ eq '/');
# #print STDERR "\trm -r \"$_\"\n";
# `rm -r "$_"`;
#}
update_header.py (code)
import os
# File path
file_path = 'mergePeaks_res.txt'
# Read in the file
with open(file_path, 'r') as file:
lines = file.readlines()
# Split the first line into components
components = lines[0].split("\t")
# Change the first component to "name"
components[0] = "name"
# Join the components back into a single string
lines[0] = "\t".join(components)
# Write the file back out
with open(file_path, 'w') as file:
file.writelines(lines)
adjust_mergePeaks_res.py (code)
import pandas as pd
# Read the mergePeaks result file and bedtools result file
mergePeaks_df = pd.read_csv('mergePeaks_res.txt', sep='\t', header=0)
bedtools_df = pd.read_csv('bedtools_res.txt', sep='\t', header=None, names=['chr', 'start', 'end'])
# Function to check if a peak is within bedtools ranges
# def is_in_bedtools_range(row):
filtered_mergePeaks_df = []
for _, row in mergePeaks_df.iterrows():
for _, bed_row in bedtools_df.iterrows():
if row['chr'] == bed_row['chr'] and row['start'] >= bed_row['start'] and row['end'] <= bed_row['end']:
row['start'] = bed_row['start']
row['end'] = bed_row['end']
filtered_mergePeaks_df.append(row)
# Convert the filtered results to a DataFrame
filtered_mergePeaks_df = pd.DataFrame(filtered_mergePeaks_df)
## Write the filtered results to a file
#filtered_mergePeaks_df.to_csv('filtered_mergePeaks_res.txt', sep='\t', index=False)
## Filter the mergePeaks rows
#filtered_mergePeaks_df = mergePeaks_df[mergePeaks_df.apply(is_in_bedtools_range, axis=1)]
## Sort and drop duplicates
sorted_df = filtered_mergePeaks_df.sort_values(by=['chr', 'start', 'end'])
deduplicated_df = sorted_df.drop_duplicates(subset=['chr', 'start', 'end'])
## Save to file
deduplicated_df.to_csv('filtered_mergePeaks_res.txt', sep='\t', index=False)
点赞本文的读者
还没有人对此文章表态
没有评论
Peak calling using homer combining sicer and macs2
Peak and Motif analyses in Promoters
How to use H3K27ac, H3K4me1, and RNA-seq to identify enhancers and their target genes?
© 2023 XGenes.com Impressum