# In Memory
# =========
# This next bit of code uses Bio.SeqIO.parse() to load a FASTA file,
# and then turns it into an in-memory python dictionary.
# This is *not* suitable for FASTA files with millions of entries.

from __future__ import print_function

from Bio.Alphabet import generic_dna
from Bio import SeqIO


def get_accession_num(seq_record):
    """Return the accession from a record's id, without its version suffix.

    ``seq_record.id`` is split on '|' and the fourth field is taken as the
    (possibly versioned) accession.  For example, an id of
    ``gi|2765658|emb|Z78533.1|CIZ78533`` yields ``Z78533``.

    Raises IndexError if the id has fewer than four '|'-separated fields.
    """
    accession_atoms = seq_record.id.split('|')
    gb_name = accession_atoms[3]
    # Cut at the first '.' instead of dropping a fixed two characters, so
    # multi-digit versions (".12") are removed completely and an accession
    # that carries no version at all is returned intact.
    return gb_name.partition('.')[0]

# Parse the FASTA file lazily, then build an ordinary dictionary of records
# keyed by whatever get_accession_num returns for each record.
rec_iterator = SeqIO.parse("ls_orchid.fasta", "fasta", generic_dna)
orchid_dict = SeqIO.to_dict(rec_iterator, key_function=get_accession_num)

# Report the key, description and sequence of every record in the dictionary.
for id_num, record in orchid_dict.items():
    print('id number: %s' % id_num)
    print('description: %s' % record.description)
    print('sequence: %s' % record.seq)


# Indexed
# =======
# This next version uses the Bio.SeqIO.index() function which will index
# the FASTA file without loading all the records into memory at once.
# This is suitable for FASTA files with millions of entries.

from Bio.Alphabet import generic_dna
from Bio import SeqIO


def get_accession_num(record_id):
    """Return the accession from a record id string, without its version.

    ``record_id`` is split on '|' and the fourth field is taken as the
    (possibly versioned) accession.  For example,
    ``gi|2765658|emb|Z78533.1|CIZ78533`` yields ``Z78533``.

    Raises IndexError if the id has fewer than four '|'-separated fields.
    """
    accession_atoms = record_id.split('|')
    gb_name = accession_atoms[3]
    # Cut at the first '.' instead of dropping a fixed two characters, so
    # multi-digit versions (".12") are removed completely and an accession
    # that carries no version at all is returned intact.
    return gb_name.partition('.')[0]

# Index the FASTA file, passing get_accession_num as the key function so the
# index is keyed by accession number rather than by the full record id.
orchid_dict = SeqIO.index("ls_orchid.fasta", "fasta", generic_dna,
                          key_function=get_accession_num)

for id_num in orchid_dict:
    # Records are not held in memory: every lookup on the index parses the
    # record from the file again, so fetch it once per key and reuse it.
    record = orchid_dict[id_num]
    print('id number: %s' % id_num)
    print('description: %s' % record.description)
    print('sequence: %s' % record.seq)