Code

The following algorithms are useful in identification of translated small open reading frames via peptidomics.

1. A Python script to remove symbols assigned to stop codons in fasta-formatted 3-frame translation files prior to proteomic database construction.
 
Remove_stops_from_fasta.py
 
import os 
import sys
 
#### SPECIFY YOUR FILES HERE ####
#input_file=’example.txt’
#output_file=’output.txt’
 
input_file=sys.argv[1]
output_file=sys.argv[2]
 
####################
 
fp=open(input_file,’r’)
lines=fp.readlines()
fp.close()
 
#print len(lines)
#print xy
 
id_map={}
 
output_fp=open(output_file,’w’)
counter=0
 
for line in lines:
 
cur_line=line.rstrip()
cur_line=cur_line.replace(‘**’,’*’)
cur_line=cur_line.replace(‘***’,’*’)
cur_line=cur_line.replace(‘****’,’*’)
cur_line=cur_line.replace(‘*****’,’*’)
cur_line=cur_line.replace(‘******’,’*’)
cur_line=cur_line.replace(‘*******’,’*’)
cur_line=cur_line.replace(‘********’,’*’)
cur_line=cur_line.replace(‘*********’,’*’)
cur_line=cur_line.replace(‘**********’,’*’)
cur_line=cur_line.replace(‘***********’,’*’)
 
 
if ‘>’ in cur_line and ‘)’ in cur_line:
arr=line.split(‘>’)
temp=arr[1]
arr2=temp.split(‘)’)
 
identifier=’>’+str(arr2[0]) + ‘)’
#output_fp.write(cur_line+’\n’);
 
 
if counter>0:
#print temp_count
if temp_count>7:
output_fp.write(temp_line+’\n’)
 
if identifier in id_map:
id_map[identifier]+=1
 
temp_line=identifier+’_’+str(id_map[identifier])+’\n’
temp_count=0
else:
id_map[identifier]=1
 
temp_line=identifier+’\n’
temp_count=0
 
 
 
else:
if ‘*’ in cur_line:
arr=cur_line.split(‘*’)
 
 
arr_counter=0
for a in arr:
if arr_counter==0:
temp_count+=len(a)
temp_line=temp_line+a
else:
#print temp_count
if temp_count>7:
#print temp_line
output_fp.write(temp_line+’\n’)
 
if identifier in id_map:
id_map[identifier]+=1
 
temp_line=identifier+’_’+str(id_map[identifier])+’\n’+a
temp_count=0
else:
id_map[identifier]=1
 
temp_line=identifier+’\n’+a
temp_count=0
 
#temp_count=0
#temp_line=temp_line+a
temp_count=len(a)
arr_counter+=1
 
else:
 
temp_count+=len(cur_line)
temp_line=temp_line+cur_line
 
 
 
counter+=1
 
output_fp.close()
 
2. A Python script to remove assembled transcripts with FKPM values below a selected cutoff prior to proteomic database construction.
 
FKPM_filter.py
 
import os
expression_file = ‘isoforms.fpkm_tracking’
 
expr={}
with open(expression_file,’r’) as f:
k=0 
for line in f:
if k==0:
k+=1
else:
words=line.split(‘\t’)
fpkm=words [9]
id=words[0]
expr[id]=float(fpkm)
 
cutoff=1
useCutoff = True
if useCutoff:
GTF = ‘transcripts_filtered’+str(cutoff)+’.gtf’
outfile = ‘transcripts_filtered’+str(cutoff)+’.fa’
else:
GTF = ‘transcripts_nofilter.gtf’
outfile = ‘transcripts_nofilter.fa’
 
 
with open(GTF,’w’) as outf:
with open(‘transcripts.gtf’, ‘r’) as f:
for line in f:
if line[0]== ‘#’:
pass
else:
words= line.split(‘\t’)
transcript_id= words[8].split(‘;’)[1].split(‘"’)[1]
if useCutoff:
if transcript_id in expr and expr[transcript_id] > cutoff:
outf.write(‘\t’.join(words))
else:
outf.write(‘\t’.join(words))
 
# switch to the path of your genome fasta file if you have one.
os.system(‘gffread -w ‘+outfile+’ -g hg19.fa ‘+GTF)
 

3. A Perl script to remove tryptic peptide sequences that exactly match annotated proteins in a current proteome build (downloaded locally).

peptide_filter.pl

#!/usr/bin/perl
 
# look for exact peptide matches in a protein fasta file
# Cache which proteins have which 5-aa sequences in them
# Then for a given peptide, look at the list of protein containing the first
# five aa, and do a simple substring match
 
use strict;
use warnings;
BEGIN {eval{ require Smart::Comments; import Smart::Comments (‘###’)};}
 
# hoh uses 1.2GB on uniprothum. 1.5 for hoa, but only 60s
# hostrings use 800M in 60s
my $Seed_Length = 4; 
# String in outfile when you don’t find a protein
my $Not_Found_String = “NOT_FOUND”;
 
use Getopt::Long;
my ($peps_file, $fasta_file, $out_file);
my $Usage = “
perl find_pep_in_protein.pl -pep peptide_list.txt -fasta human.protein.faa -out results.txt
This will read a one-column list of peptides from peptide_list.txt
    and a FASTA protein file, and send results to results.txt
The result file is two tab-delimited columns: peptide, protein ID.
If the peptide wasn’t found, this column will say $Not_Found_String.
”;
GetOptions(
“out_file=s” => \$out_file,
“peps_file=s” => \$peps_file,
“fasta_file=s” => \$fasta_file,
) or die $Usage;
die “Not enough arguments:\n$Usage\n”
    unless defined($out_file) && defined($peps_file) && defined($fasta_file);
 
my $prots_ref = read_fasta($fasta_file);
my %seed_proteins; # which proteins contain each small e.g., 5-aa seed seq?
my $seq_count = 0;
foreach my $seqid (keys %$prots_ref) {
    my $seq = $prots_ref->{$seqid}{sequence};
    my $max_l = length ($seq) - $Seed_Length;
    for (my $i = 0; $i <= $max_l; $i++) {
my $seed = substr($seq, $i, $Seed_Length);
# Assume repeats are rare enough to ignore duplicates of a given seed
$seed_proteins{$seed}.=”:$seqid”;
#push @{$seed_proteins{$seed}}, $seqid++;
    }
    $seq_count++;
    if ($seq_count % 1000 == 0) {
warn “$seq_count done. “,scalar(keys %seed_proteins), ” seeds\n”;
    }
}
 
open my $peps_fh, “<”, $peps_file
    or die “Can’t open peps file $peps_file: $!\n”;
my %found;
my @not_found;
open my $out_fh, “>”, $out_file
    or die “Can’t open out file $out_file: $!\n”;
 
#print “Starting search at “, scalar(localtime), “\n”;
while (<$peps_fh>) {
    s/\r?\n//;
    my $peptide = $_;
    my $seed1 = substr($peptide, 0, $Seed_Length);
    my $seed2 = substr($peptide, -$Seed_Length);
    if ($seed1 eq $seed2) {
warn “$seed1 at beginning and end of $peptide\n”;
my @prot_ids = split /:/, $seed_proteins{$seed1};
shift @prot_ids; # empty first id
my %done;
foreach my $prot_id (@prot_ids) {
   my $prot_seq = $prots_ref->{$prot_id}{sequence};
   if (index($prot_seq, $peptide) != -1) {
$found{$peptide} = $prot_id;
print $out_fh “$peptide\t$prot_id\n”;
last;
   }
}
    }
    else {
my @prot_ids1 = split /:/, $seed_proteins{$seed1};
shift @prot_ids1;
my @prot_ids2_tmp = split /:/, $seed_proteins{$seed2};
shift @prot_ids2_tmp;
my %prot_ids2 = map {($_=>1)} @prot_ids2_tmp;
 
foreach my $prot_id1 (@prot_ids1) {
   if (exists $prot_ids2{$prot_id1}) { # both seeds in same protein
my $prot_seq = $prots_ref->{$prot_id1}{sequence};
if (index($prot_seq, $peptide) != -1) {
   $found{$peptide} = $prot_id1;
   print $out_fh “$peptide\t$prot_id1\n”;
   last;
}
   }
}
    }
 
    if (! $found{$peptide}) {
print $out_fh “$peptide\t$Not_Found_String\n”;
push @not_found, $peptide;
    }
}
close $peps_fh;
close $out_fh;
warn scalar(@not_found), ” peptides NOT found, “, scalar(keys %found), ” found\n”;
#print “Ending search at “, scalar(localtime), “\n”;
 
 
exit;
 
################################################################################
sub read_fasta {
    my ($fasta_in) = @_;
    open my $fh, ‘<’, $fasta_in or die “FASTA in $fasta_in: $!\n”;
    my %seqs = ();
    my $current_id;
    while (<$fh>) {
s/\r?\n//;  # chomp even Windows \r\n
if (/>(\S+)(\s+(.*))?/) {
   $current_id = $1;
   my $desc = defined $2 ? $3: “”;
   $seqs{$current_id} = {
sequence  => “”,
description  => $desc,
id  => $current_id,
   }
} else {
   $seqs{$current_id}{sequence} .= $_;
}
    }
    close $fh;
    return \%seqs;