genetica:bioinf_process:fastqc:script1
# The first script collects the FastQC summary.txt of every sample into a single table (one row per sample, one column per FastQC module)
# It is located at:
/bonn_data/Bonn_0_fastq/fastqcRawdata/summarizing_fastq_stats.sh
#It can be run by simply writing
$ ./summarizing_fastq_stats.sh
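and it prints each value to the screen and appends the summary table to the file fastqcRawdata_P1.sumstats (created in the directory the script is run from)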
- summarizing_fastq_stats.sh
#!/bin/bash
#
# summarizing_fastq_stats.sh - bash program made by vifehe.
#
# Collapses the per-sample FastQC summary.txt files found under
#   <idir>/*.fq_fastqc/summary.txt
# into one tab-separated table (<ofile>): a legend, a column header and one
# row per sample holding the status of each FastQC module.
#
# Usage:
#   ./summarizing_fastq_stats.sh [idir [ofile]]
#     idir   directory holding the *.fq_fastqc folders (default: fastqcRawdata_P1)
#     ofile  table to write                            (default: <idir>.sumstats)
#
# Expected input (abridged example of one summary.txt; the status is the first
# tab-separated field of each line):
#   PASS  Basic Statistics           SN7570192_15190_P4H11_L5150_1_sequence.fq.gz
#   PASS  Per base sequence quality  SN7570192_15190_P4H11_L5150_1_sequence.fq.gz
#   ...
#   WARN  Sequence Duplication Levels SN7570192_15190_P4H11_L5150_1_sequence.fq.gz
#   PASS  Overrepresented sequences  SN7570192_15190_P4H11_L5150_1_sequence.fq.gz
#   PASS  Kmer Content               SN7570192_15190_P4H11_L5150_1_sequence.fq.gz
#
# NOTE(review): statuses are mapped to columns by LINE POSITION (line 1 -> BS,
# line 2 -> PBSQ, ... line 11 -> KC). This matches the 11-module layout shown
# above; a summary.txt with a different module list would be mislabelled.
# Confirm the FastQC version in use before relying on the column names.

# Number of summary.txt lines (FastQC modules) reported per sample.
readonly N_MODULES=11

#######################################
# Write the legend and the column header, replacing any previous content so
# that re-running the script does not stack a second header and duplicate rows.
# Arguments: $1 - output file
# Returns:   non-zero if the file cannot be written
#######################################
write_header() {
  local ofile=$1
  {
    printf '%s\n' \
      '#BS = Basic statistics' \
      '#PBSQ = Per base sequence quality' \
      '#PSQS = Per sequence quality scores' \
      '#PBSC = Per base sequence content' \
      '#bGC = Per base GC content' \
      '#sGC = Per sequence GC content' \
      '#bN = Per base N content' \
      '#SLD = Sequence Length Distribution' \
      '#SDL = Sequence Duplication Levels' \
      '#OS = Overrepresented sequences' \
      '#KC = Kmer Content'
    printf 'Sample\tBS\tPBSQ\tPSQS\tPBSC\tbGC\tsGC\tbN\tSLD\tSDL\tOS\tKC\n'
  } > "$ofile"
}

#######################################
# Derive the sample label from the path of a summary.txt.
# The label is "<field4>-<field5>" of the "_"-separated name of the folder
# that contains summary.txt, e.g.
#   .../SN7570192_15190_P4H11_L5150_1_sequence.fq_fastqc/summary.txt -> L5150-1
# Arguments: $1 - path to a summary.txt
# Outputs:   the label on stdout
#######################################
sample_id() {
  local dir=${1%/*}
  local name=${dir##*/}   # parent folder name, whatever the depth of idir
  local -a parts
  IFS=_ read -r -a parts <<< "$name"
  printf '%s-%s\n' "${parts[3]:-}" "${parts[4]:-}"
}

#######################################
# Print the status (first tab-separated field) of the first N_MODULES lines of
# a summary.txt, one per line. Missing lines are emitted as empty strings so a
# row always has the same number of columns.
# Arguments: $1 - path to a summary.txt
# Outputs:   N_MODULES lines on stdout
#######################################
module_statuses() {
  awk -F'\t' -v n="$N_MODULES" '
    NR <= n { print $1 }
    END     { for (i = NR + 1; i <= n; i++) print "" }
  ' "$1"
}

#######################################
# Build the summary table for one directory of FastQC results.
# Arguments: $1 - input directory, $2 - output file
# Outputs:   for every sample: the summary.txt path, the sample label and each
#            module status on stdout (one value per line); the table in $2;
#            a warning on stderr when no summary.txt is found
# Returns:   non-zero only if the output file cannot be written
#######################################
summarize_fastqc_dir() {
  local idir=$1 ofile=$2
  local summary sample
  local -a statuses
  local n_files=0

  write_header "$ofile" || return 1

  for summary in "$idir"/*.fq_fastqc/summary.txt; do
    # An unmatched glob stays literal; skip it instead of writing a junk row.
    [[ -f "$summary" ]] || continue

    sample=$(sample_id "$summary")
    mapfile -t statuses < <(module_statuses "$summary")

    # Progress on screen: path, sample label, then one status per line.
    printf '%s\n' "$summary" "$sample" "${statuses[@]}"

    # Data goes through %s so a "%" or "\" in a name cannot alter the row.
    {
      printf '%s' "$sample"
      printf '\t%s' "${statuses[@]}"
      printf '\n'
    } >> "$ofile"

    n_files=$((n_files + 1))
  done

  if (( n_files == 0 )); then
    printf 'warning: no %s/*.fq_fastqc/summary.txt found; %s holds the header only\n' \
      "$idir" "$ofile" >&2
  fi
}

#######################################
# Entry point. With no arguments it behaves like the historical script:
# reads fastqcRawdata_P1 and writes fastqcRawdata_P1.sumstats.
#######################################
main() {
  local idir=${1:-fastqcRawdata_P1}
  idir=${idir%/}   # so the default ofile is "<idir>.sumstats", not "<idir>/.sumstats"
  local ofile=${2:-${idir}.sumstats}
  summarize_fastqc_dir "$idir" "$ofile"
}

main "$@"
# a bit of the output is:
[vifehe@detritus fastqcRawdata]$ head -n20 fastqcRawdata_P1.sumstats
#BS = Basic statistics
#PBSQ = Per base sequence quality
#PSQS = Per sequence quality scores
#PBSQ = Per base sequence content
#bCG = Per base GC content
#sGC = Per sequence GC content
#bN = Per base N content
#SLD = Sequence Length Distribution
#SDL = Sequence Duplication Levels
#OS = Overrepresented sequences
#KC = Kmer Content
Sample BS PBSQ PSQS PBSQ bGC sGC bN SLD SDL OS KC
MND1014-2 PASS PASS PASS PASS PASS WARN PASS PASS WARN PASS PASS
MND116-1 PASS PASS PASS PASS PASS WARN PASS PASS WARN PASS PASS
MND116-2 PASS PASS PASS PASS PASS WARN PASS PASS WARN PASS PASS
MND126-1 PASS PASS PASS PASS PASS WARN PASS PASS WARN PASS PASS
MND126-2 PASS PASS PASS PASS PASS WARN PASS PASS WARN PASS PASS
MND1405-1 PASS PASS PASS PASS PASS WARN PASS PASS WARN PASS PASS
MND1405-2 PASS PASS PASS PASS PASS WARN PASS PASS WARN PASS PASS
MND1493-1 PASS PASS PASS PASS PASS WARN PASS PASS WARN PASS PASS
genetica/bioinf_process/fastqc/script1.txt · Last modified: 2020/08/04 10:58 by 127.0.0.1