Code
The following algorithms are useful in identification of translated small open reading frames via peptidomics.
1. A Python script to remove symbols assigned to stop codons in fasta-formatted 3-frame translation files prior to proteomic database construction.
Remove_stops_from_fasta.py
import os
import sys
#### SPECIFY YOUR FILES HERE ####
#input_file=’example.txt’
#output_file=’output.txt’
input_file=sys.argv[1]
output_file=sys.argv[2]
####################
fp=open(input_file,’r’)
lines=fp.readlines()
fp.close()
#print len(lines)
#print xy
id_map={}
output_fp=open(output_file,’w’)
counter=0
for line in lines:
cur_line=line.rstrip()
cur_line=cur_line.replace(‘**’,’*’)
cur_line=cur_line.replace(‘***’,’*’)
cur_line=cur_line.replace(‘****’,’*’)
cur_line=cur_line.replace(‘*****’,’*’)
cur_line=cur_line.replace(‘******’,’*’)
cur_line=cur_line.replace(‘*******’,’*’)
cur_line=cur_line.replace(‘********’,’*’)
cur_line=cur_line.replace(‘*********’,’*’)
cur_line=cur_line.replace(‘**********’,’*’)
cur_line=cur_line.replace(‘***********’,’*’)
if ‘>’ in cur_line and ‘)’ in cur_line:
arr=line.split(‘>’)
temp=arr[1]
arr2=temp.split(‘)’)
identifier=’>’+str(arr2[0]) + ‘)’
#output_fp.write(cur_line+’\n’);
if counter>0:
#print temp_count
if temp_count>7:
output_fp.write(temp_line+’\n’)
if identifier in id_map:
id_map[identifier]+=1
temp_line=identifier+’_’+str(id_map[identifier])+’\n’
temp_count=0
else:
id_map[identifier]=1
temp_line=identifier+’\n’
temp_count=0
else:
if ‘*’ in cur_line:
arr=cur_line.split(‘*’)
arr_counter=0
for a in arr:
if arr_counter==0:
temp_count+=len(a)
temp_line=temp_line+a
else:
#print temp_count
if temp_count>7:
#print temp_line
output_fp.write(temp_line+’\n’)
if identifier in id_map:
id_map[identifier]+=1
temp_line=identifier+’_’+str(id_map[identifier])+’\n’+a
temp_count=0
else:
id_map[identifier]=1
temp_line=identifier+’\n’+a
temp_count=0
#temp_count=0
#temp_line=temp_line+a
temp_count=len(a)
arr_counter+=1
else:
temp_count+=len(cur_line)
temp_line=temp_line+cur_line
counter+=1
output_fp.close()
2. A Python script to remove assembled transcripts with FKPM values below a selected cutoff prior to proteomic database construction.
FKPM_filter.py
import os
expression_file = ‘isoforms.fpkm_tracking’
expr={}
with open(expression_file,’r’) as f:
k=0
for line in f:
if k==0:
k+=1
else:
words=line.split(‘\t’)
fpkm=words [9]
id=words[0]
expr[id]=float(fpkm)
cutoff=1
useCutoff = True
if useCutoff:
GTF = ‘transcripts_filtered’+str(cutoff)+’.gtf’
outfile = ‘transcripts_filtered’+str(cutoff)+’.fa’
else:
GTF = ‘transcripts_nofilter.gtf’
outfile = ‘transcripts_nofilter.fa’
with open(GTF,’w’) as outf:
with open(‘transcripts.gtf’, ‘r’) as f:
for line in f:
if line[0]== ‘#’:
pass
else:
words= line.split(‘\t’)
transcript_id= words[8].split(‘;’)[1].split(‘"’)[1]
if useCutoff:
if transcript_id in expr and expr[transcript_id] > cutoff:
outf.write(‘\t’.join(words))
else:
outf.write(‘\t’.join(words))
# switch to the path of your genome fasta file if you have one.
os.system(‘gffread -w ‘+outfile+’ -g hg19.fa ‘+GTF)
3. A Perl script to remove tryptic peptide sequences that exactly match annotated proteins in a current proteome build (downloaded locally).
peptide_filter.pl
#!/usr/bin/perl
# look for exact peptide matches in a protein fasta file
# Cache which proteins have which 5-aa sequences in them
# Then for a given peptide, look at the list of protein containing the first
# five aa, and do a simple substring match
use strict;
use warnings;
BEGIN {eval{ require Smart::Comments; import Smart::Comments (‘###’)};}
# hoh uses 1.2GB on uniprothum. 1.5 for hoa, but only 60s
# hostrings use 800M in 60s
my $Seed_Length = 4;
# String in outfile when you don’t find a protein
my $Not_Found_String = “NOT_FOUND”;
use Getopt::Long;
my ($peps_file, $fasta_file, $out_file);
my $Usage = “
perl find_pep_in_protein.pl -pep peptide_list.txt -fasta human.protein.faa -out results.txt
This will read a one-column list of peptides from peptide_list.txt
and a FASTA protein file, and send results to results.txt
The result file is two tab-delimited columns: peptide, protein ID.
If the peptide wasn’t found, this column will say $Not_Found_String.
”;
GetOptions(
“out_file=s” => \$out_file,
“peps_file=s” => \$peps_file,
“fasta_file=s” => \$fasta_file,
) or die $Usage;
die “Not enough arguments:\n$Usage\n”
unless defined($out_file) && defined($peps_file) && defined($fasta_file);
my $prots_ref = read_fasta($fasta_file);
my %seed_proteins; # which proteins contain each small e.g., 5-aa seed seq?
my $seq_count = 0;
foreach my $seqid (keys %$prots_ref) {
my $seq = $prots_ref->{$seqid}{sequence};
my $max_l = length ($seq) - $Seed_Length;
for (my $i = 0; $i <= $max_l; $i++) {
my $seed = substr($seq, $i, $Seed_Length);
# Assume repeats are rare enough to ignore duplicates of a given seed
$seed_proteins{$seed}.=”:$seqid”;
#push @{$seed_proteins{$seed}}, $seqid++;
}
$seq_count++;
if ($seq_count % 1000 == 0) {
warn “$seq_count done. “,scalar(keys %seed_proteins), ” seeds\n”;
}
}
open my $peps_fh, “<”, $peps_file
or die “Can’t open peps file $peps_file: $!\n”;
my %found;
my @not_found;
open my $out_fh, “>”, $out_file
or die “Can’t open out file $out_file: $!\n”;
#print “Starting search at “, scalar(localtime), “\n”;
while (<$peps_fh>) {
s/\r?\n//;
my $peptide = $_;
my $seed1 = substr($peptide, 0, $Seed_Length);
my $seed2 = substr($peptide, -$Seed_Length);
if ($seed1 eq $seed2) {
warn “$seed1 at beginning and end of $peptide\n”;
my @prot_ids = split /:/, $seed_proteins{$seed1};
shift @prot_ids; # empty first id
my %done;
foreach my $prot_id (@prot_ids) {
my $prot_seq = $prots_ref->{$prot_id}{sequence};
if (index($prot_seq, $peptide) != -1) {
$found{$peptide} = $prot_id;
print $out_fh “$peptide\t$prot_id\n”;
last;
}
}
}
else {
my @prot_ids1 = split /:/, $seed_proteins{$seed1};
shift @prot_ids1;
my @prot_ids2_tmp = split /:/, $seed_proteins{$seed2};
shift @prot_ids2_tmp;
my %prot_ids2 = map {($_=>1)} @prot_ids2_tmp;
foreach my $prot_id1 (@prot_ids1) {
if (exists $prot_ids2{$prot_id1}) { # both seeds in same protein
my $prot_seq = $prots_ref->{$prot_id1}{sequence};
if (index($prot_seq, $peptide) != -1) {
$found{$peptide} = $prot_id1;
print $out_fh “$peptide\t$prot_id1\n”;
last;
}
}
}
}
if (! $found{$peptide}) {
print $out_fh “$peptide\t$Not_Found_String\n”;
push @not_found, $peptide;
}
}
close $peps_fh;
close $out_fh;
warn scalar(@not_found), ” peptides NOT found, “, scalar(keys %found), ” found\n”;
#print “Ending search at “, scalar(localtime), “\n”;
exit;
################################################################################
sub read_fasta {
my ($fasta_in) = @_;
open my $fh, ‘<’, $fasta_in or die “FASTA in $fasta_in: $!\n”;
my %seqs = ();
my $current_id;
while (<$fh>) {
s/\r?\n//; # chomp even Windows \r\n
if (/>(\S+)(\s+(.*))?/) {
$current_id = $1;
my $desc = defined $2 ? $3: “”;
$seqs{$current_id} = {
sequence => “”,
description => $desc,
id => $current_id,
}
} else {
$seqs{$current_id}{sequence} .= $_;
}
}
close $fh;
return \%seqs;
}