# In Memory
# =========
# This next bit of code uses Bio.SeqIO.parse() to load a FASTA file,
# and then turns it into an in-memory python dictionary.
# This is *not* suitable for FASTA files with millions of entries.

from Bio.Alphabet import generic_dna
from Bio import SeqIO

def get_accession_num(seq_record):
    accession_atoms = seq_record.id.split('|')
    gb_name = accession_atoms[3]
    # strip the version info before returning
    return gb_name[:-2]

rec_iterator = SeqIO.parse("ls_orchid.fasta","fasta", generic_dna)
orchid_dict = SeqIO.to_dict(rec_iterator, get_accession_num)

for id_num in orchid_dict:
    print 'id number:', id_num
    print 'description:', orchid_dict[id_num].description
    print 'sequence:', orchid_dict[id_num].seq


# Indexed
# =======
# This next version uses the Bio.SeqIO.index() function which will index
# the FASTA file without loading all the records into memory at once.
# This is suitable for FASTA files with millions of entries.

from Bio.Alphabet import generic_dna
from Bio import SeqIO

def get_accession_num(record_id):
    accession_atoms = record_id.split('|')
    gb_name = accession_atoms[3]
    # strip the version info before returning
    return gb_name[:-2]

orchid_dict = SeqIO.index("ls_orchid.fasta","fasta", generic_dna)

for id_num in orchid_dict:
    print 'id number:', id_num
    print 'description:', orchid_dict[id_num].description
    print 'sequence:', orchid_dict[id_num].seq