# In Memory
# =========
# This next bit of code uses Bio.SeqIO.parse() to load a FASTA file,
# and then turns it into an in-memory python dictionary.
# This is *not* suitable for FASTA files with millions of entries.

from __future__ import print_function

# NOTE(review): Bio.Alphabet appears to have been removed in Biopython 1.78;
# on newer releases drop this import and the alphabet arguments below --
# confirm against the Biopython version this script targets.
from Bio.Alphabet import generic_dna
from Bio import SeqIO


def get_accession_num(record_id):
    """Return the accession number embedded in a pipe-delimited record id.

    The id is split on '|' and the fourth field is taken as the versioned
    accession (e.g. 'gi|2765658|emb|Z78533.1|CIZ78533' -> 'Z78533.1').
    Everything from the first '.' onwards (the version suffix) is dropped.

    Args:
        record_id: The identifier string of a sequence record.

    Returns:
        The accession number without its version suffix.

    Raises:
        IndexError: If the id has fewer than four '|'-separated fields.
    """
    accession_atoms = record_id.split('|')
    gb_name = accession_atoms[3]
    # Strip the version info before returning.  Split on the '.' separator
    # rather than chopping two characters, so multi-digit versions and
    # unversioned accessions are handled correctly.
    return gb_name.partition('.')[0]


rec_iterator = SeqIO.parse("ls_orchid.fasta", "fasta", generic_dna)
# to_dict() hands the key function a whole record, so adapt it to the
# id-based helper above.
orchid_dict = SeqIO.to_dict(
    rec_iterator,
    key_function=lambda seq_record: get_accession_num(seq_record.id),
)

for id_num in orchid_dict:
    record = orchid_dict[id_num]
    print('id number: %s' % id_num)
    print('description: %s' % record.description)
    print('sequence: %s' % record.seq)


# Indexed
# =======
# This next version uses the Bio.SeqIO.index() function which will index
# the FASTA file without loading all the records into memory at once.
# This is suitable for FASTA files with millions of entries.

# index() hands the key function the record id string, which is exactly
# what get_accession_num() expects.
orchid_dict = SeqIO.index(
    "ls_orchid.fasta",
    "fasta",
    generic_dna,
    key_function=get_accession_num,
)

for id_num in orchid_dict:
    # Fetch the record once: each lookup on the index re-reads it from disk.
    record = orchid_dict[id_num]
    print('id number: %s' % id_num)
    print('description: %s' % record.description)
    print('sequence: %s' % record.seq)