﻿#!/usr/bin/env python

#########################################################################################################################
#Script to filter fasta sequences based on a list of desired sequences													#
#By David Peris {Genetics Lab, Biotechnology Center UW-Madison, WI}														#
#																														#
#usage: python filter_fasta.py																							#
#Introduce your output extension name & your desired list of sequences													#
#FASTA files requires to end in .fas, do not include extra "." in the name file											#
#No include folders in the directory where your fasta files are found													#
#Your desired list of sequences must contain each ID sequence in each row												#
#																														#
#																														#
#########################################################################################################################

import os
from Bio import SeqIO

#Section for selecting a name extension for your fasta sequences
print "Type a name extension for your new fasta files"
output_addition_name = raw_input()

#Section for selecting the file where your desired sequences are contained
print "Type the name of your list of desired sequences"
desired_sequences = raw_input()

#We want to filter the fasta files in our current path
current_path = 0
current_path = os.getcwd()

print "filtering files in", current_path

#We will generate a list of fasta files in the current path and fas extension will be modified to fasta extension

list_of_files = 0
list_of_files = os.listdir(current_path)

fasta_files = []

for file in list_of_files:
	name = file.split('.')
	if name[1] == "fasta":
		fasta_files.append(file)
	if name[1] == "fas":
		fasta_files.append(file)

#All files have been introduced in our list, however we are going to filter to keep fasta files in a list

wanted = set(line.rstrip("\n").split(None,1)[0] for line in open(desired_sequences))
print "Found %i unique identifiers in %s" % (len(wanted), desired_sequences)

for fasta_alignment in fasta_files:
	fasta_alignment_name = fasta_alignment.split('.')
	output_file = fasta_alignment_name[0] + "_" + output_addition_name + "." + fasta_alignment_name[1]
	records = (r for r in SeqIO.parse(fasta_alignment, "fasta") if r.id in wanted)
	count = SeqIO.write(records, output_file, "fasta")
	print "Saved %i records from %s to %s" % (count, fasta_alignment, output_file)
	if count < len(wanted):
		print "Warning %i IDs not found in %s" % (len(wanted)-count, fasta_alignment)

print "Done"
