My code loops through multiple files in a directory, parses each file and appends the parsed content of each file to FinalVariantfile.txt
.
The code works, but duplicates the content of each file.
When I ran the code with two files the output contained 4 files. Could someone please explain why this is happening and how to fix this?
#!/usr/bin/perl -w
use strict;
#directory structure
my $home = "/data/";
my $tsvdirectory = $home . "test_all_runs/" . $ARGV[0];
my $tsvfiles = $home . "test_all_runs/" . $ARGV[0] . "/tsv_files.txt";
my $FinalVariants = $home . "test_all_runs/" . $ARGV[0] . "/FinalVariantfile.txt";
my @tsvfiles = ();
my @currentlines = ();
my $currentline = '';
my $currentCNVline = '';
my @currentCNVlines = ();
my @HotSpotLines = ();
my @CNVLines = ();
# command to produce the vcf_files.txt file stored in each individual run
# directory; the file list includes solely vcf files which have not been
# previously prepared and/or annotated
my $cmd = `ls $tsvdirectory/FOCUS*.tsv > $tsvfiles`;
# print "$cmd";
my $cmda = "ls $tsvdirectory/FOCUS*.tsv > $tsvfiles";
# print "$cmda";
# this code opens the vcf_files.txt file and passes each line into an array for
# indidivudal manipulation
open( TXT2, "$tsvfiles" );
while ( <TXT2> ) {
push( @tsvfiles, $_ );
}
close(TXT2);
foreach ( @tsvfiles ) {
chop($_);
}
# this code then parses each of the files listed by name in the tsvfiles array
foreach ( @tsvfiles ) {
my $currenttsvfile = "$_"; # establishes the current file being manipulated
my $MDLfinaltsvfile = $currenttsvfile;
$MDLfinaltsvfile =~ s/.tsv/_prepared.txt/g;
# this series of variable calls names the various intermediate or
# final output files
my $MDLlinestsvfile = $currenttsvfile;
$MDLlinestsvfile =~ s/.tsv/_withCNV.txt/g;
my $Variantlinestsvfile = $currenttsvfile;
$Variantlinestsvfile =~ s/.tsv/_HotSpot.txt/g;
my $MDLtsvfile = $currenttsvfile;
$MDLtsvfile =~ s/.tsv/_FilteredAllcolumns.txt/g;
my $MDLsampleid = $currenttsvfile;
$MDLsampleid =~ s/-oncogene.tsv//g;
print "The currentVCFis############# " . $currenttsvfile . "
";
my @SampleID = ();
@SampleID = split ///, $MDLsampleid;
print "The sampleIDis##############" . $SampleID[4] . "
";
my $CNVdata = $currenttsvfile;
$CNVdata =~ s/.tsv/_cnv.txt/g;
my $FinalCNVdata = $currenttsvfile;
$FinalCNVdata =~ s/.tsv/_finalcnv.txt/g;
my $cmd2 = `fgrep -v "#" $currenttsvfile > $MDLlinestsvfile`;
print "$cmd2"; # this code extracts from the current vcf file all of the
# lines of data and outputs them into a separate file
my $cmd5 = `grep -vwE "(CNV|intronic|synonymous|utr_3|utr_5)"
#removes lines that contain CNV/intronic/synonymous/utr_3/utr_5"
$MDLlinestsvfile > $Variantlinestsvfile`;
print "$cmd5";
open( my $fh_in, '<', $Variantlinestsvfile )
or die "cannot open $Variantlinestsvfile: $!
";
#removes lines that contain 0/0 and ./. genotypes from field 70.
open( my $fh_out, '>', $MDLtsvfile )
or die "cannot open $MDLtsvfile: $!
";
while ( my $line = <$fh_in> ) {
# tab/field-based:
my @fields = split( /s+/, $line );
print $fh_out $line unless ( $fields[70] =~ m|([0.])/1| );
}
close($fh_in);
close($fh_out);
#open each filtered file with all columns and pushes it into array.
open( TXT2, "$MDLtsvfile" );
while (<TXT2>) {
push( @HotSpotLines, $_ );
}
close(TXT2);
foreach (@HotSpotLines) {
chop($_);
my @HotSpotEntries = ();
my $currentMDLline = $_;
@HotSpotEntries = split( / /, $currentMDLline );
my $chr = $HotSpotEntries[9];
my $position = $HotSpotEntries[10];
my $cosmicids = $HotSpotEntries[21];
my $refforward = $HotSpotEntries[67];
my $genotype = $HotSpotEntries[70];
my $altforward = $HotSpotEntries[77];
my $altreverse = $HotSpotEntries[78];
my $cDNA = $HotSpotEntries[81];
my $exon = $HotSpotEntries[83];
my $conseq = $HotSpotEntries[84];
my $location = $HotSpotEntries[88];
my $geneclass = $HotSpotEntries[92];
my $aachange = $HotSpotEntries[98];
my $transcript = $HotSpotEntries[100];
$currentline
= $SampleID[4] . " "
. $chr . " "
. $position . " "
. $cosmicids . " "
. $refforward . " "
. $refreverse . " "
. $genotype . " "
. $altforward . " "
. $altreverse . " "
. $cDNA . " "
. $exon . " "
. $conseq . " "
. $location . " "
. $geneclass . " "
. $aachange . " "
. $transcript;
# print "The currentVCFlineis ".$currentline."
";
push( @currentlines, $currentline );
}
my $i;
for ( $i = 0; $i < @currentlines; $i += 1 ) {
my $currentguiline = $currentlines[$i];
my $cmd5 = `echo "$currentguiline" >> $FinalVariants`;
print "$cmd5";
#my $cmd9 = `sed -i '1i$SampleID[4]' $FinalVariants`; print $cmd9;
}
}
See Question&Answers more detail:os