Small RNA sequencing processing in the example of smallRNA_7

gene_x 0 like s 1115 view s

Tags: python, R, bash, pipeline

adapter sequence

Lexogen small RNA-Seq kit

some common adapter sequences from different kits for reference:

    - TruSeq Small RNA (Illumina): TGGAATTCTCGGGTGCCAAGG
    - Small RNA Kits V1 (Illumina): TCGTATGCCGTCTTCTGCTTGT
    - Small RNA Kits V1.5 (Illumina): ATCTCGTATGCCGTCTTCTGCTTG
    - NEXTflex Small RNA Sequencing Kit v3 for Illumina Platforms (Bioo Scientific): TGGAATTCTCGGGTGCCAAGG
    - LEXOGEN Small RNA-Seq Library Prep Kit (Illumina): TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC

[Header],,,,,
IEMFileVersion,4,,,,
InvestigatorName,ag96,,,,
ExperimentName,ag96,,,,
Date,16.10.2023,,,,
Workflow,GenerateFASTQ,,,,
Application,NextSeqFASTQOnly,,,,
Assay,TruSeq HT,,,,
Description,pcr,,,,
Chemistry,Amplicon,,,,
,,,,,
[Reads],,,,,
82,,,,,
,,,,,
,,,,,
[Settings],,,,,
Adapter,TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC,,,,
,,,,,
,,,,,
[Data],,,,,
Sample_ID,Sample_Name,I7_Index_ID,index,Sample_Project,Description
nf930,01_0505_WaGa_wt_EV_RNA,SRi7001,CAGCGT,2023_064_nf_ute,smallRNA-Seq
nf931,02_0505_WaGa_sT_DMSO_EV_RNA,SRi7002,GATCAC,2023_064_nf_ute,smallRNA-Seq
nf932,03_0505_WaGa_sT_Dox_EV_RNA,SRi7003,ACCAGT,2023_064_nf_ute,smallRNA-Seq
nf933,04_0505_WaGa_scr_DMSO_EV_RNA,SRi7004,TGCACG,2023_064_nf_ute,smallRNA-Seq
nf934,05_0505_WaGa_scr_Dox_EV_RNA,SRi7005,ACATTA,2023_064_nf_ute,smallRNA-Seq
nf935,06_1905_WaGa_wt_EV_RNA,SRi7006,GTGTAG,2023_064_nf_ute,smallRNA-Seq
nf936,07_1905_WaGa_sT_DMSO_EV_RNA,SRi7007,CTAGTC,2023_064_nf_ute,smallRNA-Seq
nf937,08_1905_WaGa_sT_Dox_EV_RNA,SRi7008,TGTGCA,2023_064_nf_ute,smallRNA-Seq
nf938,09_1905_WaGa_scr_DMSO_EV_RNA,SRi7009,TCAGGA,2023_064_nf_ute,smallRNA-Seq
nf939,10_1905_WaGa_scr_Dox_EV_RNA,SRi7010,CGGTTA,2023_064_nf_ute,smallRNA-Seq
nf940,11_control_MKL1,SRi7011,TTAACT,2023_064_nf_ute,smallRNA-Seq
nf941,12_control_WaGa,SRi7012,ATGAAC,2023_064_nf_ute,smallRNA-Seq

input data

ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf930/01_0505_WaGa_wt_EV_RNA_S1_R1_001.fastq.gz         0505_WaGa_wt.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf931/02_0505_WaGa_sT_DMSO_EV_RNA_S2_R1_001.fastq.gz    0505_WaGa_sT_DMSO.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf932/03_0505_WaGa_sT_Dox_EV_RNA_S3_R1_001.fastq.gz     0505_WaGa_sT_Dox.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf933/04_0505_WaGa_scr_DMSO_EV_RNA_S4_R1_001.fastq.gz   0505_WaGa_scr_DMSO.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf934/05_0505_WaGa_scr_Dox_EV_RNA_S5_R1_001.fastq.gz    0505_WaGa_scr_Dox.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf935/06_1905_WaGa_wt_EV_RNA_S6_R1_001.fastq.gz         1905_WaGa_wt.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf936/07_1905_WaGa_sT_DMSO_EV_RNA_S7_R1_001.fastq.gz    1905_WaGa_sT_DMSO.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf937/08_1905_WaGa_sT_Dox_EV_RNA_S8_R1_001.fastq.gz     1905_WaGa_sT_Dox.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf938/09_1905_WaGa_scr_DMSO_EV_RNA_S9_R1_001.fastq.gz   1905_WaGa_scr_DMSO.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf939/10_1905_WaGa_scr_Dox_EV_RNA_S10_R1_001.fastq.gz   1905_WaGa_scr_Dox.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf940/11_control_MKL1_S11_R1_001.fastq.gz               control_MKL1.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf941/12_control_WaGa_S12_R1_001.fastq.gz               control_WaGa.fastq.gz

run cutadapt

for sample in 0505_WaGa_wt 0505_WaGa_sT_DMSO 0505_WaGa_sT_Dox 0505_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_wt 1905_WaGa_sT_DMSO 1905_WaGa_sT_Dox 1905_WaGa_scr_DMSO 1905_WaGa_scr_Dox  control_MKL1 control_WaGa; do
  cutadapt -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -q 20 -o ${sample}_cutadapted.fastq.gz --minimum-length 5 --trim-n ${sample}.fastq.gz >> LOG
done
#jhuang@hamburg:~/DATA/Data_Ute/Data_Ute_smallRNA_7$ fastp -i 0505_WaGa_wt.fastq.gz -o 0505_WaGa_wt3.fastq.gz -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC

run COMPSRA

ln -s ../Data_Ute_smallRNA_3/bundle_v1 .
#change the reference of 'hg38 -> hg38_orig' to 'hg38 -> hg38_miRNA_JN707599' (other options are hg38_WaGa_KJ128379 and hg38_MKL1_FJ173815)
cd bundle_v1/db/stars; rm hg38; ln -s hg38_miRNA_JN707599 hg38;

# DEBUG_1: Make sure the file COMPSRA.jar under Data_Ute_smallRNA_7
# DEBUG_2: "-qc -ra TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -rb 4" does not work! Using cutadapt -a xxxx -q 20 replace those parameters!

mkdir out_out
for sample in 0505_WaGa_wt 0505_WaGa_sT_DMSO 0505_WaGa_sT_Dox 0505_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_wt 1905_WaGa_sT_DMSO 1905_WaGa_sT_Dox 1905_WaGa_scr_DMSO 1905_WaGa_scr_Dox  control_MKL1 control_WaGa; do
  echo "mkdir our_out/${sample}_cutadapted/"
done
for sample in 0505_WaGa_wt 0505_WaGa_sT_DMSO 0505_WaGa_sT_Dox 0505_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_wt 1905_WaGa_sT_DMSO 1905_WaGa_sT_Dox 1905_WaGa_scr_DMSO 1905_WaGa_scr_Dox  control_MKL1 control_WaGa; do
  echo "java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in ${sample}_cutadapted.fastq.gz -out ./our_out/"
done

java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 0505_WaGa_wt_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 0505_WaGa_sT_DMSO_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 0505_WaGa_sT_Dox_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 0505_WaGa_scr_DMSO_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 0505_WaGa_scr_Dox_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 1905_WaGa_wt_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 1905_WaGa_sT_DMSO_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 1905_WaGa_sT_Dox_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 1905_WaGa_scr_DMSO_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 1905_WaGa_scr_Dox_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in control_MKL1_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in control_WaGa_cutadapted.fastq.gz -out ./our_out/
#END

#4.2.3 -rb/-rm_bias n
#To remove n random bases in both 5’ (5-prime) and 3’ (3-prime) ends after removing the adapter sequence.
#4.2.4 -rh/-rm_low_quality_head score
#To remove the low quality bases with the score less than score from 5’ (5-prime) end.
#4.2.5 -rt/-rm_low_quality_tail score
#To remove the low quality bases with the score less than score from 3’ (3-prime) end.
#4.2.6 -rr/-rm_low_quality_read score
#To remove the low quality reads with the average score less than score.
#4.6.3 -fdclass/-fun_diff_class A1,A2,...,An
#To set the small RNAs that will be performed the differential expression analysis. The format is the same as the parameter -ac/-ann_class A1,A2,...,An.
#4.6.4 -fdcase/-fun_diff_case ID1,ID2,...,IDn
#To set the IDs of case samples.
#4.6.5 -fdctrl/-fun_diff_control ID1,ID2,...,IDn
#To set the IDs of control samples.
#4.4.2 -ac/-ann_class A1,A2,...,An
#To set the small RNA categories that will be annotated. The index of small RNA is listed:
#    1 miRNA
#    2 piRNA
#    3 tRNA
#    4 snoRNA
#    5 snRNA
#    6 circRNA

java -jar COMPSRA.jar -ref hg38 -fun -fm -fms 1-5 -fdclass 1,2,3,4,5 -fdann -pro COMPSRA_MERGE -inf ./sample.list -out ./our_out/
java -jar COMPSRA.jar -ref hg38 -fun -fd -fdclass 1,2,3,4,5 -fdcase 1-2 -fdctrl 3-6 -fdnorm cpm -fdtest mwu -fdann -pro COMPSRA_DEG -inf ./sample.list -out ./our_out/

get the read number of virus genome (see also ~/DATA/Data_Ute/Data_Ute_smallRNA_3/README_virusgenome)

#rsync -a -P jhuang@newton.ibvt.uni-stuttgart.de:~/COMPSRA_V1.0.2/our_out/pfsk1_mcvsyn_TSQ ./
#!!NOTE that the used COMPSRA.jar --> Data_Ute_smallRNA_3/COMPSRA_V1.0.2, the database (changable) --> Data_Ute_smallRNA_3/bundle_v1!

#sample.list
#0505_WaGa_sT_DMSO
#1905_WaGa_sT_DMSO
#0505_WaGa_sT_Dox
#1905_WaGa_sT_Dox
#0505_WaGa_scr_DMSO
#1905_WaGa_scr_DMSO
#0505_WaGa_scr_Dox
#1905_WaGa_scr_Dox
#0505_WaGa_wt
#1905_WaGa_wt
#control_MKL1
#control_WaGa

for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
  mv ${sample}_cutadapted ${sample}
done

for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
    cd ${sample}
    #samtools sort ${sample}_cutadapted_STAR_Aligned.out.bam > sorted.bam
    #samtools index sorted.bam
    #samtools view -h sorted.bam "JN707599:1-5387" > JN707599.sam
    #samtools view -Sb JN707599.sam > JN707599.bam
    #samtools view -h sorted.bam "JN707599:1-5387" | samtools view -Sb - > JN707599.bam
    samtools view -h sorted.bam | grep -E 'JN707599' | samtools view -Sb - > JN707599.bam
    #samtools view -h sorted.bam | grep -E '^@|chr' | samtools view -b > hg38.bam
    cd ..
done

# #https://bedtools.readthedocs.io/en/latest/content/tools/intersect.html
# #https://www.researchgate.net/post/How_to_find_number_of_reads_aligned_to_a_particular_region_of_a_chromosome_from_sam_file
# echo -e 'JN707599\t4347\t4368' > region_5p.bed
# intersectBed -abam 0505_WaGa_sT_Dox_cutadapted_STAR_Aligned.out.bam -b region_5p.bed -bed | wc -l
# #382487    -->  382487
# echo -e 'JN707599\t4381\t4402' > region_3p.bed
# intersectBed -abam 0505_WaGa_sT_Dox_cutadapted_STAR_Aligned.out.bam -b region_3p.bed -bed | wc -l
# #172659
# #Total = 382487+172659=555146   
# echo -e 'JN707599\t4332\t4415' > region_primary.bed
# intersectBed -abam JN707599.bam -b region_primary.bed -bed | wc -l
# #NOTE that 556752 > 555146

for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
  echo "---- ${sample} ----" >> LOG_JN707599_M1
  cd ${sample}
  intersectBed -abam JN707599.bam -b ../region_all.bed -bed | wc -l >> ../LOG_JN707599_M1
  intersectBed -abam JN707599.bam -b ../region_primary.bed -bed | wc -l >> ../LOG_JN707599_M1
  cd ..
done

for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
  echo "---- ${sample} ----" >> LOG_read_number
  zcat ${sample}2.fastq.gz | echo $((`wc -l`/4)) >> LOG_read_number
done

cd our_out_on_hg38+JN707599
for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
  echo "---- ${sample} ----" >> LOG_hg38
  echo "---- ${sample} ----" >> LOG_JN707599
  echo "---- ${sample} ----" >> LOG_hg38_alignments
  echo "---- ${sample} ----" >> LOG_JN707599_alignments
  cd ${sample}
  #-F 260: Excludes secondary alignments (flag 256) and unmapped reads (flag 4).
  samtools view -c -F 260 sorted.bam `seq -f "chr%g" 1 22` chrX chrY chrM `samtools view -H sorted.bam | grep '@SQ' | cut -f 2 | cut -d ':' -f 2 | grep 'chrUn\|chrEBV'` | awk '{sum+=$1} END {print sum}' >> ../LOG_hg38
  samtools view -c -F 260 sorted.bam JN707599 >> ../LOG_JN707599
  samtools view -c -F 4 sorted.bam `seq -f "chr%g" 1 22` chrX chrY chrM `samtools view -H sorted.bam | grep '@SQ' | cut -f 2 | cut -d ':' -f 2 | grep 'chrUn\|chrEBV'` | awk '{sum+=$1} END {print sum}' >> ../LOG_hg38_alignments
  samtools view -c -F 4 sorted.bam JN707599 >> ../LOG_JN707599_alignments
  cd ..
done

#for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt #1905_WaGa_wt control_MKL1 control_WaGa; do
#  echo "---- ${sample} ----" >> LOG_hg38
#  cd ${sample}
#  samtools flagstat ${sample}_cutadapted_STAR_Aligned.out.bam >> ../LOG_hg38
#  cd ..
#done
#for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
#  echo "---- ${sample} ----" >> LOG_JN707599
#  cd ${sample}
#  samtools flagstat JN707599.bam >> ../LOG_JN707599
#  cd ..
#done

for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt #1905_WaGa_wt control_MKL1 control_WaGa; do
  echo "---- ${sample} ----" >> LOG_alignments
  cd ${sample}
  samtools flagstat ${sample}2_STAR_Aligned.out.bam >> ../LOG_alignments
  cd ..
done
grep "mapped (" LOG_alignments

#The following results calculate raw counts (Note: If you only want to merge the count files, you can use -fm -fms.)
java -jar COMPSRA.jar -ref hg38 -fun -fm -fms 1-12 -fdclass 1,2,3,4,5,6 -fdann -pro COMPSRA_MERGE -inf ./sample.list -out ./our_out/
##The following command calculate statistical test after defining case and control.
#java -jar COMPSRA.jar -ref hg38 -fun -fd -fdclass 1,2,3,4,5,6 -fdcase 1-10 -fdctrl 11-12 -fdnorm cpm -fdtest mwu -fdann -pro COMPSRA_DEG -inf ./sample.list -out ./our_out/

# sed -i -e 's/_August/-08/g' COMPSRA_MERGE_0_miRNA.txt
# sed -i -e 's/_November/-11/g' COMPSRA_MERGE_0_miRNA.txt
# sed -i -e 's/_June/-06/g' COMPSRA_MERGE_0_miRNA.txt
sed -i -e 's/_cutadapted_STAR_Aligned_miRNA.txt//g' COMPSRA_MERGE_0_miRNA.txt
# #sed -i -e 's/_piRNA.txt//g' COMPSRA_MERGE_0_piRNA.txt
# #sed -i -e 's/_tRNA.txt//g' COMPSRA_MERGE_0_tRNA.txt
# #sed -i -e 's/_snoRNA.txt//g' COMPSRA_MERGE_0_snoRNA.txt
# #sed -i -e 's/_snRNA.txt//g' COMPSRA_DEG_0_snRNA.txt
# sed -i -e 's/_August/-08/g' COMPSRA_DEG_0_miRNA.txt
# sed -i -e 's/_November/-11/g' COMPSRA_DEG_0_miRNA.txt
# sed -i -e 's/_June/-06/g' COMPSRA_DEG_0_miRNA.txt
# sed -i -e 's/_STAR_Aligned_miRNA.txt//g' COMPSRA_DEG_0_miRNA.txt

#python3
import pandas as pd
df = pd.read_csv('COMPSRA_MERGE_0_miRNA.txt', sep='\t', index_col=0)
df = df.drop(columns=['Unnamed: 13'])
# Assuming df is your DataFrame
df.loc['Sum'] = df.sum(numeric_only=True)
df.to_csv('COMPSRA_MERGE_0_miRNA_.txt', sep='\t')
df = pd.read_csv('COMPSRA_MERGE_0_piRNA.txt', sep='\t', index_col=0)
df = df.drop(columns=['Unnamed: 13'])
df.loc['Sum'] = df.sum(numeric_only=True)
df.to_csv('COMPSRA_MERGE_0_piRNA_.txt', sep='\t')
df = pd.read_csv('COMPSRA_MERGE_0_snoRNA.txt', sep='\t', index_col=0)
df = df.drop(columns=['Unnamed: 13'])
df.loc['Sum'] = df.sum(numeric_only=True)
df.to_csv('COMPSRA_MERGE_0_snoRNA_.txt', sep='\t')
df = pd.read_csv('COMPSRA_MERGE_0_snRNA.txt', sep='\t', index_col=0)
df = df.drop(columns=['Unnamed: 13'])
df.loc['Sum'] = df.sum(numeric_only=True)
df.to_csv('COMPSRA_MERGE_0_snRNA_.txt', sep='\t')
df = pd.read_csv('COMPSRA_MERGE_0_tRNA.txt', sep='\t', index_col=0)
df = df.drop(columns=['Unnamed: 13'])
df.loc['Sum'] = df.sum(numeric_only=True)
df.to_csv('COMPSRA_MERGE_0_tRNA_.txt', sep='\t')
#samtools flagstat **.bam
#47217410 + 0 in total (QC-passed reads + QC-failed reads)
#45166321 + 0 mapped (95.66% : N/A)
#2051089 + 0 in total (QC-passed reads + QC-failed reads)
#TODO: check the microRNA in the paper mentioned in which position?
#Single publications on EVs as transport vehicles for specific miRNAs in the pathogenesis of Merkel cell carcinoma have also been reported, such as miR-375 and its function in proliferation

~/Tools/csv2xls-0.4/csv_to_xls.py COMPSRA_MERGE_0_miRNA_.txt \
COMPSRA_MERGE_0_piRNA_.txt \
COMPSRA_MERGE_0_tRNA_.txt \
COMPSRA_MERGE_0_snoRNA_.txt \
COMPSRA_MERGE_0_snRNA_.txt \
-d$'\t' -o raw_counts_hg38.xls;

# # sorting the DEG table, change the sheet labels to 'miRNA_between_columns_B-C_and_columns_D-G'
# ~/Tools/csv2xls-0.4/csv_to_xls.py COMPSRA_DEG_0_miRNA.txt -d$'\t' -o normalized_and_significance_test_miRNA.xls;

##merging the row counts and statical values
#cut -f1-1 COMPSRA_MERGE_0_snoRNA.txt > f1_MERGE
#cut -f1-1 COMPSRA_DEG_0_snoRNA.txt > f1_DEG
#cut -f1-1 COMPSRA_MERGE_0_miRNA.txt > f1_MERGE
#cut -f1-1 COMPSRA_DEG_0_miRNA.txt > f1_DEG
#diff f1_MERGE f1_DEG

sT: This abbreviation could stand for various things depending on the context of the research. In some studies, it might refer to "short-term," "signal transducer," "small T antigen," or other terms relevant to the specific area of study. The exact meaning would depend on the context in which these samples are being used.

scr: This is likely short for "scramble" or "scrambled." In the context of genetic research, it often refers to a control group where cells have been treated with a scrambled sequence of RNA or DNA. This serves as a baseline to compare against other groups where specific genes are targeted.

Dox: This usually stands for "Doxorubicin," which is a type of chemotherapy medication used in cancer treatment. It's also commonly used in research to study cellular responses to chemotherapy.

DMSO: Short for "Dimethyl Sulfoxide," DMSO is an organic solvent that is often used as a vehicle in biological experiments. It's used to dissolve other substances and carry them into cells. In control experiments, DMSO is used without the active drug/compound to differentiate the effects of the solvent from the effects of the drug/compound being tested.

WaGa: This could be a specific cell line or experimental condition used in your research. Cell lines often have specific names that are abbreviated in this way.

wt: This typically stands for "wild type." In genetic research, "wild type" refers to organisms or cells that have not undergone any genetic modification and are thus considered to be the standard or normal phenotype.

Control_MKL1 and Control_WaGa (parental cell): These seem to be control samples for the experiment. "Control" indicates that these samples are used as standards or baselines for comparison. "MKL1" and "WaGa" are likely specific identifiers for the controls used.

sT vs scr (condition): This comparison could be named based on the nature of the genetic or molecular treatment in the samples. If "sT" and "scr" refer to different genetic constructs or treatments (such as a specific gene modification versus a scrambled sequence), you could name this comparison something like "Genetic Modification Comparison" or "Treatment Type Comparison." The exact name would depend on what "sT" and "scr" specifically represent in your study.

Dox vs DMSO (treatment): Since "Dox" likely stands for Doxorubicin (a chemotherapy drug) and "DMSO" is a solvent used as a control, this comparison is between a drug treatment and a control treatment. You could name this comparison "Drug Treatment vs Control" or more specifically "Doxorubicin Treatment vs Solvent Control."

WaGa vs MKL-1 (cell line): As you've identified these as cell lines, this comparison is straightforward. It's a comparison between two different cell lines. You might name it "Cell Line Comparison: WaGa vs MKL-1."

0505 vs 1905 (donor): ****

draw plots with R using DESeq2 (~/DATA/Data_Ute/Data_Ute_smallRNA_3_nextflow/README_draw_miRNA_plots)

library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library("org.Hs.eg.db")
library(DESeq2)
library(gplots)
setwd("/home/jhuang/DATA/Data_Ute/Data_Ute_smallRNA_7/our_out_on_hg38+JN707599/")

d.raw<- read.delim2("COMPSRA_MERGE_0_miRNA.txt",sep="\t", header=TRUE, row.names=1)
d.raw$X <- NULL
#colnames(d.raw)<- c("WaGa_EV", "MKL1_EV", "WaGa_wt", "MKL1_wt")
d.raw[] <- lapply(d.raw, as.numeric)

#MCPyV-M1   0   0   29  0   23  0   30  8   0   0   202 196
# New row to add
new_row <- data.frame(
  X0505_WaGa_sT_DMSO = 0,
  X1905_WaGa_sT_DMSO = 0,
  X0505_WaGa_sT_Dox = 29,
  X1905_WaGa_sT_Dox = 0,
  X0505_WaGa_scr_DMSO = 23,
  X1905_WaGa_scr_DMSO = 0,
  X0505_WaGa_scr_Dox = 30,
  X1905_WaGa_scr_Dox = 8,
  X0505_WaGa_wt = 0,
  X1905_WaGa_wt = 0,
  control_MKL1 = 202,
  control_WaGa = 196,
  row.names = "MCPyV-M1"
)

# Add the new row to the data frame
d.raw <- rbind(d.raw, new_row)

#cell_line = as.factor(c("WaGa","WaGa", "WaGa","WaGa", "WaGa","WaGa", "WaGa","WaGa", "WaGa","WaGa", "MKL1","WaGa"))
#condition = as.factor(c("sT","sT", "sT","sT", "scr","scr", "scr","scr", "wt","wt", "control","control"))
#treatment = as.factor(c("DMSO","DMSO", "Dox","Dox", "DMSO","DMSO", "Dox","Dox", "wt","wt", "control","control"))

EV_or_parental = as.factor(c("EV","EV", "EV","EV", "EV","EV", "EV","EV", "EV","EV", "parental","parental"))
donor = as.factor(c("0505","1905", "0505","1905", "0505","1905", "0505","1905", "0505","1905", "0505","1905"))
replicates = as.factor(c("sT_DMSO","sT_DMSO", "sT_Dox","sT_Dox", "scr_DMSO","scr_DMSO", "scr_Dox","scr_Dox", "wt","wt", "control","control"))
ids = as.factor(c("0505_WaGa_sT_DMSO","1905_WaGa_sT_DMSO","0505_WaGa_sT_Dox","1905_WaGa_sT_Dox","0505_WaGa_scr_DMSO","1905_WaGa_scr_DMSO","0505_WaGa_scr_Dox","1905_WaGa_scr_Dox","0505_WaGa_wt","1905_WaGa_wt","control_MKL1","control_WaGa"))

cData = data.frame(row.names=colnames(d.raw), replicates=replicates, ids=ids, donor=donor, EV_or_parental=EV_or_parental)
dds<-DESeqDataSetFromMatrix(countData=d.raw, colData=cData, design=~replicates+donor)

rld <- rlogTransformation(dds)

#The normalized counts are calculated from the 'median of ratios method' (see the calulcation process at the paragraph 'DESeq2-normalized counts: Median of ratios method' at https://hbctraining.github.io/DGE_workshop/lessons/02_DGE_count_normalization.html).

# -- before pca --
png("pca.png", 1200, 800)
plotPCA(rld, intgroup=c("replicates"))
#plotPCA(rld, intgroup = c("replicates", "batch"))
#plotPCA(rld, intgroup = c("replicates", "ids"))
#plotPCA(rld, "batch")
dev.off()

png("pca2.png", 1200, 800)
plotPCA(rld, intgroup=c("donor"))
dev.off()

# -- before heatmap --
## generate the pairwise comparison between samples
library(gplots) 
library("RColorBrewer")
png("heatmap.png", 1200, 800)
distsRL <- dist(t(assay(rld)))
mat <- as.matrix(distsRL)
#paste( rld$dex, rld$cell, sep="-" )
#rownames(mat) <- colnames(mat) <- with(colData(dds),paste(replicates,batch, sep=":"))
#rownames(mat) <- colnames(mat) <- with(colData(dds),paste(replicates,ids, sep=":"))
hc <- hclust(distsRL)
hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
dev.off()

# -- remove batch effect --
# #show the results which delete the batches effect
# #http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-after-vst-are-there-still-batches-in-the-pca-plot
# mat <- assay(rld)
# mat <- limma::removeBatchEffect(mat, rld$donor)
# assay(rld) <- mat

#TODO: next time using rld
#http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#how-do-i-use-vst-or-rlog-data-for-differential-testing
#It uses the design formula to calculate the within-group variability (if blind=FALSE) or the across-all-samples variability (if blind=TRUE).
#- It is possible to visualize the transformed data with batch variation removed, using the removeBatchEffect function from limma.
#- This simply removes any shifts in the log2-scale expression data that can be explained by batch.
#IMPROVE: ~batch+replicates
#https://support.bioconductor.org/p/116375/
#Have a look at the manual pages of these functions. The first sentence of that for varianceStabilizingTransformation says "This function calculates a variance stabilizing transformation (VST) from the fitted dispersion-mean relation(s) and then transforms the count data (normalized by division by the size factors or normalization factors)." For rlog, it says "This function transforms the count data to the log2 scale in a way which minimizes differences between samples for rows with small counts, and which normalizes with respect to library size."
#Do try to read the documentation and a little bit about the underlying methods, you'll find that you'll be more efficient and have much more fun with the software.
mat <- assay(rld)
mm <- model.matrix(~replicates, colData(rld))
mat <- limma::removeBatchEffect(mat, batch=rld$donor, design=mm)
assay(rld) <- mat
#plotPCA(rld)

##https://www.biostars.org/p/403053/
##?? DESeq() function should work after removeBatchEffect ??
#dds <- DESeqDataSetFromMatrix(countData=counts, colData=factors, design = ~ Batch + Covariate)
#dds <- DESeq(dds)
#rld <- vst(dds, blind=FALSE)
#mat <- assay(rld)
#mat <- limma::removeBatchEffect(mat, rld$Batch)
#assay(rld) <- mat
#counts_batch_corrected <- assay(rld)

# -- after pca --
png("pca_after_donor_correction.png", 1200, 800)
#plotPCA(rld, intgroup = c("replicates", "batch"))
#plotPCA(rld, intgroup = c("replicates", "ids"))
plotPCA(rld, intgroup=c("replicates"))
dev.off()

png("pca_after_donor_correction2.png", 1200, 800)
plotPCA(rld, intgroup = c("donor"))
#plotPCA(rld, intgroup = c("replicates", "ids"))
#plotPCA(rld, intgroup=c("replicates"))
dev.off()

# -- after heatmap --
## generate the pairwise comparison between samples
png("heatmap_after_donor_correction.png", 1200, 800)
distsRL <- dist(t(assay(rld)))
mat <- as.matrix(distsRL)
rownames(mat) <- colnames(mat) <- with(colData(dds),paste(replicates,donor, sep=":"))
#rownames(mat) <- colnames(mat) <- with(colData(dds),paste(replicates,ids, sep=":"))
hc <- hclust(distsRL)
hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
dev.off()

#convert bam to bigwig using deepTools by feeding inverse of DESeq’s size Factor
sizeFactors(dds)
#NULL
dds <- estimateSizeFactors(dds)
sizeFactors(dds)
normalized_counts <- counts(dds, normalized=TRUE)
write.table(normalized_counts, file="normalized_counts.txt", sep="\t", quote=F, col.names=NA)
#cp COMPSRA_MERGE_0_miRNA.txt raw_counts.txt
#~/Tools/csv2xls-0.4/csv_to_xls.py raw_counts.txt normalized_counts.txt -d$'\t' -o raw_and_normalized_counts.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py normalized_counts.txt -d$'\t' -o normalized_counts_miRNA.xls;

#unter konsole
#      WaGa WaGa_EV_r1 WaGa_EV_r2       MKL1 MKL1_EV_r1 MKL1_EV_r2 
# 1/1,2948728  1/0,4790155  1/0,9996732  1/1,5667017  1/1,0727251  1/1,0755253

> sizeFactors(dds)
X0505_WaGa_sT_DMSO  X1905_WaGa_sT_DMSO   X0505_WaGa_sT_Dox   X1905_WaGa_sT_Dox 
          0.7042043           0.9986761           1.3277555           1.4950649 
X0505_WaGa_scr_DMSO X1905_WaGa_scr_DMSO  X0505_WaGa_scr_Dox  X1905_WaGa_scr_Dox 
          0.5406249           1.5657442           0.4216588           1.1627786 
      X0505_WaGa_wt       X1905_WaGa_wt        control_MKL1        control_WaGa 
          0.3560245           0.6982057           3.2447398           3.2147966

1/1,2616592=0,792607069
1/0,5302187=1,886014205
1/1,0476709=0,954498211
1/1,5243221=0,656029326
1/1,0000000=1,0
1/1,0856832=0,921079004

#bamCoverage --bam ${sample}Aligned.sortedByCoord.out.bam -o ../bigWigs/${sample}_norm.bw --binSize 10 --scaleFactor  --effectiveGenomeSize 2864785220

bamCoverage --bam WaGa_RNAAligned.sortedByCoord.out.markDups.bam -o ../bigWigs/WaGa_norm.bw --binSize 10 --scaleFactor 0.792607069         --effectiveGenomeSize 2864785220
bamCoverage --bam WaGa_derived_EV_RNAAligned.sortedByCoord.out.markDups.bam -o ../bigWigs/WaGa_EV_r1_norm.bw --binSize 10 --scaleFactor 1.886014205 --effectiveGenomeSize 2864785220
bamCoverage --bam WaGa_derived_EV_RNA_2Aligned.sortedByCoord.out.markDups.bam -o ../bigWigs/WaGa_EV_r2_norm.bw --binSize 10 --scaleFactor 0.954498211 --effectiveGenomeSize 2864785220
bamCoverage --bam MKL_1_RNAAligned.sortedByCoord.out.markDups.bam -o ../bigWigs/MKL1_norm.bw --binSize 10 --scaleFactor 0.656029326        --effectiveGenomeSize 2864785220
bamCoverage --bam MKL_1_derived_EV_RNAAligned.sortedByCoord.out.markDups.bam -o ../bigWigs/MKL1_EV_r1_norm.bw --binSize 10 --scaleFactor 1.0 --effectiveGenomeSize 2864785220
bamCoverage --bam MKL_1_derived_EV_RNA_2Aligned.sortedByCoord.out.markDups.bam -o ../bigWigs/MKL1_EV_r2_norm.bw --binSize 10 --scaleFactor 0.921079004 --effectiveGenomeSize 2864785220

#setwd("/home/jhuang/DATA/Data_Ute_RNASeq/results/featureCounts/degenes")
#---- * to untreated ----
dds<-DESeqDataSetFromMatrix(countData=d.raw, colData=cData, design=~EV_or_parental+donor)
dds$EV_or_parental <- relevel(dds$EV_or_parental, "parental")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("EV_vs_parental")

for (i in clist) {
  contrast = paste("EV_or_parental", i, sep="_")
  res = results(dds, name=contrast)
  res <- res[!is.na(res$log2FoldChange),]
  #https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-are-some-p-values-set-to-na
  res$padj <- ifelse(is.na(res$padj), 1, res$padj)

  res_df <- as.data.frame(res)
  write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))

  up <- subset(res_df, padj<=0.1 & log2FoldChange>=2)
  down <- subset(res_df, padj<=0.1 & log2FoldChange<=-2)
  write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
  write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}

~/Tools/csv2xls-0.4/csv_to_xls.py \
EV_vs_parental-all.txt \
EV_vs_parental-up.txt \
EV_vs_parental-down.txt \
-d$',' -o EV_vs_parental.xls;

dds<-DESeqDataSetFromMatrix(countData=d.raw, colData=cData, design=~replicates+donor)
dds$replicates <- relevel(dds$replicates, "sT_DMSO")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("sT_Dox_vs_sT_DMSO")

dds$replicates <- relevel(dds$replicates, "scr_Dox")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("sT_Dox_vs_scr_Dox")

dds$replicates <- relevel(dds$replicates, "scr_DMSO")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("scr_Dox_vs_scr_DMSO", "sT_DMSO_vs_scr_DMSO")

for (i in clist) {
  contrast = paste("replicates", i, sep="_")
  res = results(dds, name=contrast)
  res <- res[!is.na(res$log2FoldChange),]
  #https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-are-some-p-values-set-to-na
  res$padj <- ifelse(is.na(res$padj), 1, res$padj)

  res_df <- as.data.frame(res)
  write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))

  up <- subset(res_df, padj<=0.1 & log2FoldChange>=2)
  down <- subset(res_df, padj<=0.1 & log2FoldChange<=-2)
  write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
  write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}

~/Tools/csv2xls-0.4/csv_to_xls.py \
sT_Dox_vs_sT_DMSO-all.txt \
sT_Dox_vs_sT_DMSO-up.txt \
sT_Dox_vs_sT_DMSO-down.txt \
-d$',' -o sT_Dox_vs_sT_DMSO.xls;

~/Tools/csv2xls-0.4/csv_to_xls.py \
sT_Dox_vs_scr_Dox-all.txt \
sT_Dox_vs_scr_Dox-up.txt \
sT_Dox_vs_scr_Dox-down.txt \
-d$',' -o sT_Dox_vs_scr_Dox.xls;

~/Tools/csv2xls-0.4/csv_to_xls.py \
scr_Dox_vs_scr_DMSO-all.txt \
scr_Dox_vs_scr_DMSO-up.txt \
scr_Dox_vs_scr_DMSO-down.txt \
-d$',' -o scr_Dox_vs_scr_DMSO.xls;

~/Tools/csv2xls-0.4/csv_to_xls.py \
sT_DMSO_vs_scr_DMSO-all.txt \
sT_DMSO_vs_scr_DMSO-up.txt \
sT_DMSO_vs_scr_DMSO-down.txt \
-d$',' -o sT_DMSO_vs_scr_DMSO.xls;

for i in EV_vs_wt; do
  # read files to geness_res
  echo "geness_res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names=1)"

  echo "subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res\$log2FoldChange) >= 2.0))"
  echo "geness_res\$Color <- \"NS or log2FC < 2.0\""
  echo "geness_res\$Color[geness_res\$pvalue < 0.05] <- \"P < 0.05\""
  echo "geness_res\$Color[geness_res\$padj < 0.05] <- \"P-adj < 0.05\""
  echo "geness_res\$Color[abs(geness_res\$log2FoldChange) < 2.0] <- \"NS or log2FC < 2.0\""
  echo "geness_res\$Color[geness_res\$external_gene_name %in% intersect_891_471\$Gene] <- \"lysosomal & TFEB\""
  echo "geness_res\$Color[geness_res\$external_gene_name %in% G891_only\$Gene] <- \"lysosomal\""
  echo "geness_res\$Color[geness_res\$external_gene_name %in% G471_only\$Gene] <- \"TFEB\""
  echo "geness_res\$Color <- factor(geness_res\$Color, levels = c(\"NS or log2FC < 2.0\", \"P-adj < 0.05\", \"lysosomal & TFEB\", \"lysosomal\", \"TFEB\"))"
  echo "write.csv(geness_res, \"${i}_with_Category.csv\")"

  # pick top genes for either side of volcano to label
  # order genes for convenience:
  echo "geness_res\$invert_P <- (-log10(geness_res\$pvalue)) * sign(geness_res\$log2FoldChange)"
  echo "top_g <- c()"
  echo "top_g <- c(top_g, \
            geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:100]], \
            geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:100]])"
  echo "top_g <- unique(top_g)"
  echo "geness_res <- geness_res[, -1*ncol(geness_res)]"  # remove invert_P from matrix

  # Graph results
  echo "png(\"${i}.png\",width=1200, height=2000)"
  echo "ggplot(geness_res, \
      aes(x = log2FoldChange, y = -log10(pvalue), \
          color = Color, label = external_gene_name)) + \
      geom_vline(xintercept = c(2.0, -2.0), lty = \"dashed\") + \
      geom_hline(yintercept = -log10(0.05), lty = \"dashed\") + \
      geom_point() + \
      labs(x = \"log2(FC)\", y = \"Significance, -log10(P)\", color = \"Significance\") + \
      scale_color_manual(values = c(\"P < 0.05\"=\"darkgray\",\"P-adj < 0.05\"=\"red\",\"lysosomal\"=\"lightblue\",\"TFEB\"=\"green\",\"lysosomal & TFEB\"=\"cyan\",\"NS or log2FC < 2.0\"=\"gray\"),guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) + \
      geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res\$log2FoldChange) >= 2.0)), size = 4, point.padding = 0.15, color = \"black\", min.segment.length = .1, box.padding = .2, lwd = 2) + \
      theme_bw(base_size = 16) + \
      theme(legend.position = \"bottom\")"
  echo "dev.off()"
done

library(ggplot2)
library(ggrepel)

geness_res <- read.csv(file = paste("EV_vs_wt", "all.txt", sep="-"), row.names=1)

external_gene_name <- rownames(geness_res)
geness_res <- cbind(geness_res, external_gene_name)
#top_g <- c("hsa-miR-10b-5p","hsa-miR-1226-5p","hsa-miR-30c-5p","hsa-mir-182","hsa-miR-182-5p",  "hsa-mir-1291","hsa-miR-1291","hsa-mir-3607","hsa-mir-3653","hsa-miR-3607-3p","hsa-miR-1244","hsa-mir-1248")
top_g <- c("hsa-mir-3607","hsa-mir-1291","hsa-mir-3653","hsa-miR-3607-3p","hsa-miR-30c-5p","hsa-miR-10b-5p","hsa-miR-182-5p","hsa-miR-1291","hsa-mir-182","hsa-miR-1244","hsa-mir-1248","hsa-miR-1226-5p","hsa-mir-1224","hsa-miR-423-5p","hsa-miR-3653-3p","hsa-miR-375","hsa-miR-15a-3p","hsa-mir-3651","hsa-miR-1246","hsa-miR-4521","hsa-miR-30a-5p","hsa-miR-425-5p","hsa-miR-9-5p","hsa-miR-664a-5p","hsa-let-7f-5p","hsa-miR-146b-5p","hsa-miR-320a","hsa-miR-3607-5p","hsa-mir-3687-1","hsa-mir-3687-2","hsa-let-7a-3","hsa-mir-6723","hsa-miR-342-3p")

subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0))
geness_res$Color <- "NS or log2FC < 2.0"
geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"

#geness_res$Color[geness_res$external_gene_name %in% intersect_891_471$Gene] <- "lysosomal & TFEB"
#geness_res$Color[geness_res$external_gene_name %in% G891_only$Gene] <- "lysosomal"
#geness_res$Color[geness_res$external_gene_name %in% G471_only$Gene] <- "TFEB"
#geness_res$Color <- factor(geness_res$Color, levels = c("NS or log2FC < 2.0", "P-adj < 0.05", "lysosomal & TFEB", "lysosomal", "TFEB"))
write.csv(geness_res, "EV_vs_wt_with_Category.csv")
geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
#top_g <- c()
#top_g <- c(top_g,              geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:100]],              geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:100]])
#top_g <- unique(top_g)

geness_res <- geness_res[, -1*ncol(geness_res)]

#png("EV_vs_wt.png",width=1200, height=2000)
svg("EV_vs_wt.svg",width=12, height=14)
ggplot(geness_res,       aes(x = log2FoldChange, y = -log10(pvalue),           color = Color, label = external_gene_name)) +       geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +       geom_hline(yintercept = -log10(0.05), lty = "dashed") +       geom_point() +       labs(x = "log2(FC)", y = "Significance, -log10(P)", color = "Significance") +       scale_color_manual(values = c("P < 0.05"="orange","P-adj < 0.05"="red","NS or log2FC < 2.0"="darkgray"),guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) +       geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)), size = 4, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2) +       theme_bw(base_size = 16) +       theme(legend.position = "bottom")
dev.off()

#print all top_g-gene labels including 
svg("EV_vs_wt.svg",width=12, height=14)
ggplot(geness_res,       aes(x = log2FoldChange, y = -log10(pvalue),           color = Color, label = external_gene_name)) +       geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +       geom_hline(yintercept = -log10(0.05), lty = "dashed") +       geom_point() +       labs(x = "log2(FC)", y = "Significance, -log10(P)", color = "Significance") +       scale_color_manual(values = c("P < 0.05"="orange","P-adj < 0.05"="red","NS or log2FC < 2.0"="darkgray"),guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) +       geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g), size = 4, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2) +       theme_bw(base_size = 16) +       theme(legend.position = "bottom")
dev.off()

###################################################################
##### STEP3: prepare all_genes #####
rld <- rlogTransformation(dds)
mat <- assay(rld)
mm <- model.matrix(~replicates, colData(rld))
mat <- limma::removeBatchEffect(mat, batch=rld$donor, design=mm)
assay(rld) <- mat
RNASeq.NoCellLine <- assay(rld)
# reorder the columns
colnames(RNASeq.NoCellLine) = c("0505 WaGa sT DMSO","1905 WaGa sT DMSO","0505 WaGa sT Dox","1905 WaGa sT Dox","0505 WaGa scr DMSO","1905 WaGa scr DMSO","0505 WaGa scr Dox","1905 WaGa scr Dox","0505 WaGa wt","1905 WaGa wt","control MKL1","control WaGa")
col.order <-c("0505 WaGa sT DMSO","1905 WaGa sT DMSO","0505 WaGa sT Dox","1905 WaGa sT Dox","0505 WaGa scr DMSO","1905 WaGa scr DMSO","0505 WaGa scr Dox","1905 WaGa scr Dox","0505 WaGa wt","1905 WaGa wt","control MKL1","control WaGa")
RNASeq.NoCellLine <- RNASeq.NoCellLine[,col.order]

#Option1: filter to genes with overall lfc > 2       
datamat  <- RNASeq.NoCellLine[apply(RNASeq.NoCellLine,1,function(x){max(x)-min(x)})>=2,]

#Option2: not using the automatical comparing between max and min, rather than using manually selected from DESeq2 between conditions.
RNASeq.NoCellLine_  <- RNASeq.NoCellLine[top_g,]
colnames(RNASeq.NoCellLine_) <- c("MKL-1 Cell", "WaGa Cell", "MKL-1 EVs", "WaGa EVs") 
datamat <- RNASeq.NoCellLine_

#Option3: take all_genes        
datamat <- RNASeq.NoCellLine

#Option4: manully defining
for i in EV_vs_parental sT_Dox_vs_sT_DMSO sT_Dox_vs_scr_Dox scr_Dox_vs_scr_DMSO sT_DMSO_vs_scr_DMSO; do echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"; echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"; done
cat *.id | sort -u > ids
#add Gene_Id in the first line, delete the ""
GOI <- read.csv("ids")$Gene_Id
datamat = RNASeq.NoCellLine[GOI, ]

######################################################################
##### STEP4: clustering the genes and draw heatmap #####
#clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
mycl = cutree(hr, h=max(hr$height)/1.05)
mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",  "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");

mycol = mycol[as.vector(mycl)]
#sampleCols <- rep('GREY',ncol(RNASeq.NoCellLine_))
#names(sampleCols) <- c("mock_r1", "mock_r2", "mock_r3", "mock_r4", "WAP_r1", "WAP_r2",  "WAP_r3", "WAP_r4", "WAC_r1","WAC_r2")
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,4)=='mock'] <- 'GREY'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAP'] <- 'RED'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dM_'] <- 'CYAN'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dP_'] <- 'BLUE'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAC'] <- 'GREEN'
png("miRNA_heatmap.png", width=800, height=1000)
#svg("DEGs_heatmap.svg", width=6, height=8)
heatmap.2(as.matrix(datamat), 
  Rowv=as.dendrogram(hr), 
  Colv=NA, 
  dendrogram='row', 
  labRow="", 
  scale='row', 
  trace='none', 
  col=bluered(75), 
  RowSideColors=mycol, 
  srtCol=20, 
  lhei=c(1,8), 
  #cexRow=1.2,   # Increase row label font size
  cexCol=1.7,    # Increase column label font size
  margin=c(7,1)
 )
dev.off()
#ColSideColors = sampleCols,
heatmap.2(datamat,Rowv=as.dendrogram(hr),Colv=as.dendrogram(hc),
            scale='row',trace='none',col=bluered(75),
            RowSideColors = mycol, labRow="",
            main=sprintf('%s', "RNA-Seq for all DEGs"), srtCol=30)
dev.off()
#margin=c(5,5),
heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=30, lhei=c(0.1,2))
#heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none",, sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', cexRow=1.4, lhei=c(0.25,5))
#heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(10,10), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none')
#heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(10,10), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', cexRow=1.2, lhei=c(0.1,5))
dev.off()

#### cluster members #####
write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt')
write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')  
#~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o genelist_clusters.xls

display viral transcripts found in mRNA-seq (or small RNA) MKL-1, WaGa EVs compared to parental cells (http://xgenes.com/article/article-content/87/display-viral-transcripts-found-in-mrna-seq-mkl-1-waga-evs-compared-to-cells/)

like unlike

点赞本文的读者

还没有人对此文章表态

本文有评论

没有评论

Small RNA sequencing processing in the example of smallRNA_7

本文有评论

看文章，发评论，不要沉默

最受欢迎文章

最新文章

最多评论文章

推荐相似文章