################################################# ###### ###### ###### 千万注意:脚本会删除 OUT_DIR 下的所有文件 ###### Usage: ###### 1. Reset BIN variable in calc_icSHAPE function ###### 2. Copy two functions' code to terminal and run your data as example shows ###### ###### ################################################# ########### 0. remove rRNA reads from raw fastq function remove_rRNA() { # Check true parameters if (($# < 3)); then echo "Usage: remove_rRNA output_fastq input_fastq rRNA_index [QUEUE THREADS_NUM]"; return; fi output_fastq=$1 input_fastq=$2 rRNA_index=$3 QUEUE=Z-ZQF THREADS=20 if (($# >= 4)); then QUEUE=$4; fi if (($# >= 5)); then THREADS=$5; fi bsub -q $QUEUE -e error -o log -n $THREADS \ "bowtie2 \ -U $input_fastq \ -x $rRNA_index \ --local \ --very-sensitive-local \ -p $THREADS | \ awk 'BEGIN{total=0;rRNA=0;stderr=\"cat >&2\"}{total+=1;if(\$3==\"*\"){print \"@\"\$1\"\\n\"\$10\"\\n+\\n\"\$11;}else{rRNA+=1;}}END{printf \"%s === total:%d, left: %d, ratio: %.2f%%\\n\", \"$input_fastq\", total, total-rRNA, 100*(total-rRNA)/total | stderr}' \ > $output_fastq" } ############ # preserve the read if both of pair can not map to rRNA ############ function remove_rRNA_pairend() { # Check true parameters if (($# < 3)); then echo "Usage: remove_rRNA output_fastq_1 output_fastq_2 input_fastq_1 input_fastq_2 rRNA_index [QUEUE THREADS_NUM]"; return; fi output_fastq_1=$1 output_fastq_2=$2 input_fastq_1=$3 input_fastq_2=$4 rRNA_index=$5 QUEUE=Z-ZQF THREADS=20 if (($# >= 6)); then QUEUE=$4; fi if (($# >= 7)); then THREADS=$5; fi rm -f $output_fastq_1 $output_fastq_2 bsub -q $QUEUE -e error -o log -n $THREADS \ "bowtie2 --non-deterministic \ -x $rRNA_index \ -1 $input_fastq_1 -2 $input_fastq_2 \ --very-sensitive-local \ -p $THREADS \ --local \ | samtools view \ | awk 'BEGIN{total=0;rRNA=0;stderr=\"cat >&2\";first_line=\"\";} { if(first_line==\"\") { total+=1; first_line = \$0; }else{ split(first_line, array, \"\\t\"); if(array[1] != \$1) { print \"FATAL Error: different read name -- %s;%s\",array[1],\$1 | stderr; exit(-1); } if(array[3]==\"*\" && \$3==\"*\") { printf \"@%s\\n%s\\n+\\n%s\\n\",array[1],array[10],array[11] >> \"$output_fastq_1\"; printf \"@%s\\n%s\\n+\\n%s\\n\",\$1,\$10,\$11 >> \"$output_fastq_2\"; }else{ rRNA+=1; } first_line=\"\"; } } END{ printf \"%s === total:%d, left: %d, ratio: %.2f%%\\n\", \"$input_fastq_1\", total, total-rRNA, 100*(total-rRNA)/total | stderr }'" } ########### 1. map fastq read to genome index function icSHAPE_map() { # Check true parameters if (($# < 3)); then echo "Usage: icSHAPE_map input_fastq output_prefix genome_index [QUEUE] [THREADS]"; return; fi input_fastq=$1 output_prefix=$2 index=$3 QUEUE=Z-ZQF THREADS=20 if (($# >= 4)); then QUEUE=$4; fi if (($# >= 5)); then THREADS=$5; fi output_ref_map=${output_prefix}.map.sam output_unmap=${output_prefix}.unmap.fastq log_file=${output_prefix}.log error_file=${output_prefix}.error job_name=${output_prefix##*/} bsub -q $QUEUE -e $error_file -o $log_file -n $THREADS -J $job_name \ "bowtie2 \ -U $input_fastq \ -x $index \ --non-deterministic \ --time \ --norc \ -p $THREADS | \ awk 'BEGIN{ mCount=0; rCount=0; uCount=0; tCount=0; } { if(substr(\$0,1,1)==\"@\"){ print \$0 > \"$output_ref_map\"; } else if(\$2==0){ print \$0 > \"$output_ref_map\"; mCount+=1; tCount+=1; } else if(\$2==4){ uCount+=1; tCount+=1; printf \"@%s\\n%s\\n+\\n%s\\n\",\$1,\$10,\$11 > \"$output_unmap\"; } else if(\$2==16){ rCount+=1; tCount+=1; } else { print \"Impossible\!\!\"; } } END{ printf \"map:%s(%.2f%%)\\treverse_map:%s(%.2f%%)\\tunmap:%s(%.2f%%)\\n\",mCount,100.0*mCount/tCount,rCount,100.0*rCount/tCount,uCount,100.0*uCount/tCount }'"; } ########### 2. calculate icSHAPE score with sam files function job_id() { echo $2 | gawk 'match($0, /<([0-9]+)>/, id){print id[1]}' } function calc_icSHAPE() { # root-file-name OUT_DIR=$1 # sam-file-name SAM_D1=$2 SAM_D2=$3 SAM_N1=$4 SAM_N2=$5 # Check true parameters if [ $# < 5 ]; then echo "Usage: calc_icSHAPE out_dir DMSO_1 DMSO_2 NAI_1 NAI_2 [QUEUE] [THREADS]"; return; fi # remove all files in $OUT_DIR mkdir -p $OUT_DIR rm -rfi $OUT_DIR/* # ===========> Reset this path <========== # icSHAPE scripts directory BIN=/Share/home/zhangqf/usr/icSHAPE/scripts # ===========> icSHAPE parameters <========== # ( all parameters are from icSHAPE pipeline default parameters ) MIN_RPKM=5 # filter all transcript with RPKM lower than 5 NORM_METHOD=mean:vigintile2 HEAD_SKIP=32 TAIL_SKIP=32 SCALING_FORM=100 DIV_FACTOR=10 SUB_FACTOR=0.25 WINSOR_METHOD=factor5:scaling1 INPUT_COVERAGE=200 # a lower cutoff => 100 TARGET_HIT=2 # a lower cutoff => 1 HEAD_NULL=5 TAIL_NULL=30 # Cluster parameters THREADS=5 QUEUE=Z-ZQF if (($# >= 6)); then QUEUE=$6; fi if (($# >= 7)); then THREADS=$7; fi job_name=${OUT_DIR##*/} # input files RT_D1=$OUT_DIR/rt_D1.txt RT_D2=$OUT_DIR/rt_D2.txt RT_N1=$OUT_DIR/rt_N1.txt RT_N2=$OUT_DIR/rt_N2.txt # output files RPKM_D1=$OUT_DIR/rpkm_D1.txt RPKM_D2=$OUT_DIR/rpkm_D2.txt RPKM_N1=$OUT_DIR/rpkm_N1.txt RPKM_N2=$OUT_DIR/rpkm_N2.txt RT_D=$OUT_DIR/rt_D.txt RT_N=$OUT_DIR/rt_N.txt RT_N_D=$OUT_DIR/rt_norm_D.txt RT_N_N=$OUT_DIR/rt_norm_N.txt ENRICH=$OUT_DIR/shape.enrich SHAPE=$OUT_DIR/shape.out # Group id GROUP_ID="/Calc_Shape/"$RANDOM echo -e "Current Group ID: "${GROUP_ID} ## 1. estimate RPKM BSUB_1=$(bsub -n $THREADS -J ${job_name}_estimateRPKM_D1 -q $QUEUE -e $OUT_DIR/error -o $OUT_DIR/log -g $GROUP_ID \ "perl $BIN/estimateRPKM.pl -i $SAM_D1 -o $RPKM_D1") BSUB_2=$(bsub -n $THREADS -J ${job_name}_estimateRPKM_D2 -q $QUEUE -e $OUT_DIR/error -o $OUT_DIR/log \ "perl $BIN/estimateRPKM.pl -i $SAM_D2 -o $RPKM_D2") BSUB_3=$(bsub -n $THREADS -J ${job_name}_estimateRPKM_N1 -q $QUEUE -e $OUT_DIR/error -o $OUT_DIR/log \ "perl $BIN/estimateRPKM.pl -i $SAM_N1 -o $RPKM_N1") BSUB_4=$(bsub -n $THREADS -J ${job_name}_estimateRPKM_N2 -q $QUEUE -e $OUT_DIR/error -o $OUT_DIR/log \ "perl $BIN/estimateRPKM.pl -i $SAM_N2 -o $RPKM_N2") ## 2. calcRT BSUB_5=$(bsub -n $THREADS -J ${job_name}_calcRT_D1 -w "done($(job_id $BSUB_1))" -q $QUEUE -e $OUT_DIR/error -o $OUT_DIR/log -g $GROUP_ID \ "perl $BIN/calcRT.pl -i $SAM_D1 -o $RT_D1 -r $RPKM_D1 -c $MIN_RPKM") BSUB_6=$(bsub -n $THREADS -J ${job_name}_calcRT_D2 -w "done($(job_id $BSUB_2))" -q $QUEUE -e $OUT_DIR/error -o $OUT_DIR/log -g $GROUP_ID \ "perl $BIN/calcRT.pl -i $SAM_D2 -o $RT_D2 -r $RPKM_D2 -c $MIN_RPKM") BSUB_7=$(bsub -n $THREADS -J ${job_name}_calcRT_N1 -w "done($(job_id $BSUB_3))" -q $QUEUE -e $OUT_DIR/error -o $OUT_DIR/log -g $GROUP_ID \ "perl $BIN/calcRT.pl -i $SAM_N1 -o $RT_N1 -r $RPKM_N1 -c $MIN_RPKM") BSUB_8=$(bsub -n $THREADS -J ${job_name}_calcRT_N2 -w "done($(job_id $BSUB_4))" -q $QUEUE -e $OUT_DIR/error -o $OUT_DIR/log -g $GROUP_ID \ "perl $BIN/calcRT.pl -i $SAM_N2 -o $RT_N2 -r $RPKM_N2 -c $MIN_RPKM") ## 3. combineRTreplicates BSUB_9=$(bsub -n $THREADS -J ${job_name}_combine_D -w "done($(job_id $BSUB_5))&&done($(job_id $BSUB_6))" -q $QUEUE -e $OUT_DIR/error -o $OUT_DIR/log -g $GROUP_ID \ "perl $BIN/combineRTreplicates.pl -i $RT_D1:$RT_D2 -o $RT_D") BSUB_10=$(bsub -n $THREADS -J ${job_name}_combine_N -w "done($(job_id $BSUB_7))&&done($(job_id $BSUB_8))" -q $QUEUE -e $OUT_DIR/error -o $OUT_DIR/log -g $GROUP_ID \ "perl $BIN/combineRTreplicates.pl -i $RT_N1:$RT_N2 -o $RT_N") ## 4. normalizeRTfile BSUB_11=$(bsub -n $THREADS -J ${job_name}_norm_D -w "done($(job_id $BSUB_9))" -q $QUEUE -e $OUT_DIR/error -o $OUT_DIR/log -g $GROUP_ID \ "perl $BIN/normalizeRTfile.pl -i $RT_D -o $RT_N_D -m $NORM_METHOD -d $HEAD_SKIP -l $TAIL_SKIP -f $SCALING_FORM") BSUB_12=$(bsub -n $THREADS -J ${job_name}_norm_N -w "done($(job_id $BSUB_10))" -q $QUEUE -e $OUT_DIR/error -o $OUT_DIR/log -g $GROUP_ID \ "perl $BIN/normalizeRTfile.pl -i $RT_N -o $RT_N_N -m $NORM_METHOD -d $HEAD_SKIP -l $TAIL_SKIP -f $SCALING_FORM") ## 5. calcEnrich BSUB_13=$(bsub -n $THREADS -J ${job_name}_enrich_N -w "done($(job_id $BSUB_11))&&done($(job_id $BSUB_12))" -q $QUEUE -e $OUT_DIR/error -o $OUT_DIR/log -g $GROUP_ID \ "perl $BIN/calcEnrich.pl -f $RT_N_N -b $RT_N_D -o $ENRICH -w $WINSOR_METHOD -y $DIV_FACTOR -x $SUB_FACTOR -e complex") ## 6. filterEnrich BSUB_14=$(bsub -n $THREADS -J ${job_name}_filter_N -w "done($(job_id $BSUB_13))" -q $QUEUE -e $OUT_DIR/error -o $OUT_DIR/log -g $GROUP_ID \ "perl $BIN/filterEnrich.pl -i $ENRICH -o $SHAPE -t $INPUT_COVERAGE -T $TARGET_HIT -s $HEAD_NULL -e $TAIL_NULL") } ################ Example ################ ### 1. Map icSHAPE_map D1.sam D1.fq genome_index icSHAPE_map D2.sam D2.fq genome_index icSHAPE_map N1.sam N1.fq genome_index icSHAPE_map N2.sam N2.fq genome_index After finish.... ### 2. calc icSHAPE scores calc_icSHAPE OUT_DIR D1.sam D2.sam N1.sam N2.sam