#!/usr/local/bin/perl #=========================================================================================== # # Sequencing result file management: find out which plasmid every sequence is from; # Get the 3' sequence; For sequences which have polyA, output ten nucleotides before # polyA and after CATG, calculate the distance between CATG and polyA; For polyT # sequences, get the ten nucleotides after polyT and before the CATG then calculate # the distance between polyT and CATG. Translate the ten nucleotides after polyT to its # complementary sequences. # # Created by: Lihua Jiang 07/23/03 # Last time revise: 08/08/03 # #============================================================================================ use strict; my($line, $Id, $sequence, $boolean); my($i, $pos1, $pos2, $lengthT, $lengthA, $poly); my ( $position1, $position2, $Tag1, $Tag2, $temp1, $temp2); my ($Fivelement,$FiveAfterPlyA); my @IDlist = (); my @seqList =(); my @SplitList = (); my @Element = (); #======================================== # read in FASTA files # Push IDs and corresponding sequences # int to lists. #======================================== unless( open(FILE,"$ARGV[0]")){ die("Cannot open input file\n"); } $boolean = 0; $line = ; while($line ne ""){ if( $line =~ /^>/ ){ if($boolean == 0){ $boolean = 1; } else{ push(@seqList, $sequence); $sequence = ''; } chop($line); $Id = substr($line, 1); push(@IDlist, $Id); } else{ chop($line); $sequence .= $line; } $line = ; if($line eq ""){ push(@seqList, $sequence); $line = ; } } close(FILE); #deal with every sequence, results are outputted to output.txt file under the same directory open (OUTPUTFILE, ">output.txt"); select(OUTPUTFILE); print ("\npolyAorT MinimalTag SequenceName NlaIII Tag Distance Sequence identifier\n"); for($i = 0; $i < @seqList; $i++){ #==================================================================================== # deal with sequences which have polyT: # extract 5 or less nucleotides before polyT and ten after polyT and # ten nucleotides before CATG. Calculate the distance between polyT and CATG #===================================================================================== if($seqList[$i] =~ /T{8,}/){ $poly = "PolyT"; # get segements which have five nucleotides before polyT and ten nucleotides after polyT # from every sequences and store it in $temp1. $seqList[$i] =~ /.{0,5}T{8,}.{10}/; $temp1 = $&; #split the segment by ployT so the first 5 will be in $Element[0], #and nucleotides after ployT will be in $Element[1] @Element = split(/T{8,}/, $temp1); $Element[1] = substr($temp1,length($temp1)-10,10); # test if the 5 nuclotides are from the correct plasmids # test if the 5 nuclotides are from the correct plasmids # if it is not output 'X' if( $Element[0] eq "ACGCA" ){ $Fivelement = "H"; }elsif( $Element[0] eq "ACCGA"){ $Fivelement = "I"; }elsif( $Element[0] eq "TCGCA"){ $Fivelement = "J"; }elsif($Element[0] eq "TCCGA"){ $Fivelement = "K"; }else{ $Fivelement = "X"; } #reverse the ten nucleotides after polyT and #translate it to its complementary sequence $pos1 = index($seqList[$i], $Element[1]); $Element[1] =~ tr/ATCG/TAGC/; $Element[1] = reverse($Element[1]); #find the the position of CATG which is after polyT #then calculate distance between ployT and CATG #if there is no CATG after polyT, set distance 0 and Tag is "nothing" $pos2 = index($seqList[$i], "CATG", $pos1 + 10); if($pos2 != -1){ $Tag1 = substr($seqList[$i], $pos2 - 10, 10); $Tag1 =~ tr/ATCG/TAGC/; $Tag1 = reverse($Tag1); $lengthT = $pos2 - $pos1 + 1; } else{ $Tag1 = "*nothing*"; $lengthT = 0; } #======================================================================== # output result to output.txt file. # poly represents if the sequence has polyA or polyT if it has neither # of them, use polyQ to represent # 3prime represents ten nucleotides before polyA or after polyT # plasmid represents five nucleotides after polyA or before polyT # Tag represents ten nucleotides before CATG in polyT sequences # or after CATG in polyA sequences. # Distance represents the number of nucleotides between polyT(or polyA) # and CATG. #======================================================================= $~ = "MYFORMAT"; write; format MYFORMAT = @<<<<< @<<<<<<<<<< @< @<<<<<<<<<< @<<< @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< $poly, $Element[1], $Fivelement, $Tag1, $lengthT, $IDlist[$i] . } #==================================================================================== # deal with sequences which have polyA only: # extract 5 or less nucleotides before polyA and ten after polyA and # ten nucleotides after CATG. Calculate the distance between polyT and CATG #================================================================================== if($seqList[$i] =~ /A{8,}/ && $seqList[$i] !~ /T{8,}/){ $poly = "PolyA"; while($seqList[$i] =~ /.{10}A{8,}.{0,5}/g){ $temp2 = $&; } #split the segment by ployA so the first 10 will be in $SplitList[0], #the last five will be in $SplitList[1] @SplitList = split(/A{8,}/, $temp2); $SplitList[0] = substr($temp2, 0, 10); $SplitList[1] = substr($temp2, length($temp2)-5, 5); # test if the 5 nuclotides are from the correct plasmids # if it is not output 'X' if( $SplitList[1] eq "TGCGT" ){ $FiveAfterPlyA = "H" ; }elsif($SplitList[1] eq "TCGGT"){ $FiveAfterPlyA = "I"; }elsif($SplitList[1] eq "TGCGA"){ $FiveAfterPlyA = "J"; }elsif($SplitList[1] eq "TCGGA"){ $FiveAfterPlyA = "K"; }else{ $FiveAfterPlyA = "X"; } #find the the position of CATG which is before polyA #then calculate distance between ployA and CATG #if there is no CATG before polyA, set distance 0 and Tag is "nothing" $seqList[$i] =~ /A{8,}/; $position1 = rindex($seqList[$i], $&); $position2 = rindex($seqList[$i], "CATG", $position1 - 10 ); if($position2 != -1){ $Tag2 = substr($seqList[$i],$position2 + 5, 10); $lengthA = $position1 - ($position2 + 5); } else{ $Tag2 = "*nothing*"; $lengthA = 0; } # the output format is the same as the polyT sequences $~ = "MYFORMAT2"; write; format MYFORMAT2 = @<<<<< @<<<<<<<<<< @< @<<<<<<<<<< @<<< @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< $poly, $SplitList[0], $FiveAfterPlyA, $Tag2, $lengthA, $IDlist[$i] . } # if the sequence has neither polyA or polyT, use polyQ and nothing, X, 0 to represent respectively if($seqList[$i] !~ /A{8,}/ && $seqList[$i] !~ /T{8,}/){ print("PolyQ *nothing* X *nothing* 0 $IDlist[$i]\n"); } } close(OUTPUTFILE);