%pwd
%ls

DAT6-Lab-2014-04-14--16.ipynb  PEP_2013_PEPANNRES/            indiv14.zip                    itcont_subset.txt
DAT6-Lab-2014-04-14.ipynb      PEP_2013_PEPANNRES.zip         itcont.txt                     itcont_subset.txt-1

%matplotlib inline

# configuration: input file, output file, and how many lines to sample
RAW_DATA_FILE = 'itcont.txt'            # full raw data file that gets sampled
SUBSET_DATA_FILE = 'itcont_subset.txt'  # destination for the sampled lines
SUBSET_COUNT = 8000                     # number of lines to draw at random

# count the lines in the raw file; the `with` block closes the file handle
# deterministically instead of leaving it open until garbage collection
with open(RAW_DATA_FILE) as raw_file:
    num_lines = sum(1 for _ in raw_file)
num_lines

883835

import numpy as np

# build the line-number index [0, 1, 2, ..., num_lines-1]
idx = np.arange(num_lines)
# peek at the first five entries
idx[:5]

array([0, 1, 2, 3, 4])

# shuffle the index array in place
np.random.shuffle(idx)

# for kicks, peek at the first five shuffled values
idx[:5]

SUBSET_COUNT

8000

# create a set object using the first SUBSET_COUNT items from the shuffled array
# membership in this (sub)set of index values will determine if a line from the file is extracted for later analysis
# (a set is used so each per-line membership test is a hash lookup rather than a scan)

idx_test_set = set(idx[:SUBSET_COUNT])

# FOR THIS EXAMPLE: the random subset will be extracted without loading the entire dataset into memory

import sys
# stream the raw file once, copying only the sampled line numbers to the subset file
idx_count = 0       # zero-based number of the line currently being examined
output_count = 0    # how many lines have been written so far

# open file for writing, and the file for reading
with open(SUBSET_DATA_FILE, "w") as output_file:
    with open(RAW_DATA_FILE, "r") as input_file:
        # iterating the file object yields one line at a time until End Of File,
        # so no manual readline() priming or empty-string sentinel is needed
        for next_line in input_file:
            # test to see if the current line is in the previously defined set of line numbers (indexes) to extract
            if idx_count in idx_test_set:
                output_file.write(next_line)
                output_count += 1
                # progress marker: one dot per 1000 lines written
                if output_count % 1000 == 0:
                    sys.stdout.write('.')
            idx_count += 1

# terminate the row of progress dots explicitly: a bare `print` only emits a
# newline as a Python 2 statement and is a silent no-op expression on Python 3
sys.stdout.write('\n')

........

# count the lines that were written to the subset file; the `with` block
# closes the file handle instead of leaving it open until garbage collection
with open(SUBSET_DATA_FILE) as subset_file:
    num_lines_subset = sum(1 for _ in subset_file)
num_lines_subset

8000

# field layout, as (name, type) pairs in column order; (str, N) is a
# fixed-width string of N characters
ndtype = [
    ('Filer Identification Number', (str, 9)),
    ('Amendment Indicator', (str, 1)),
    ('Report Type', (str, 3)),
    ('Primary-General Indicator', (str, 5)),
    ('Microfilm Location (YYOORRRFFFF) ', (str, 11)),
    ('Transaction Type', (str, 3)),
    ('Entity Type', (str, 3)),
    ('Contributor/Lender/Transfer Name', (str, 200)),
    ('City/Town', (str, 30)),
    ('State', (str, 2)),
    ('Zip Code', (str, 9)),
    ('Employer', (str, 38)),
    ('Occupation', (str, 38)),
    # numeric fields
    ('Transaction Date(MMDDYYYY)', int),
    ('Transaction Amount', float),
    ('Other Identification Number', (str, 9)),
    ('Transaction ID', (str, 32)),
    ('File Number / Report ID', float),
    ('Memo Code', (str, 1)),
    ('Memo Text', (str, 100)),
    ('FEC Record Number', float),
]


# parse the sampled file into a structured array: fields are split on '|'
# and typed/named according to ndtype
# NOTE(review): invalid_raise=False means malformed rows do not raise an error;
# presumably they are skipped with a warning, so len(dataset) may be smaller
# than the number of lines in the file — confirm against the numpy docs
# NOTE(review): genfromtxt's default comment character is '#'; verify that a
# '#' inside a field cannot truncate a row here — TODO confirm
dataset = np.genfromtxt(SUBSET_DATA_FILE,
                  delimiter='|', invalid_raise=False,
                 # usecols=tuple(np.array([4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,])-1),
                  dtype=ndtype)

dataset['State'][1:5]

array(['DC', 'MD', 'NY', 'AL'], 
      dtype='|S2')

from collections import defaultdict
state_count = dict()


def default_value():
    """Return 0, the starting count for a key that has not been seen yet."""
    return 0

# turn state_count into a defaultdict: a key seen for the first time
# starts at default_value()
state_count = defaultdict(default_value, state_count)

# alternately:
#state_count=defaultdict(lambda:0,state_count)

# tally one occurrence for every value in the State field
for entry in dataset['State']:
    state_count[entry] += 1

state_count

defaultdict(<function default_value at 0x1081e8410>, {'': 17, 'ZZ': 4, 'WA': 191, 'DE': 24, 'DC': 264, 'WI': 103, 'WV': 37, 'HI': 47, 'FL': 433, 'WY': 26, 'PR': 14, 'NJ': 220, 'NM': 64, 'TX': 652, 'LA': 104, 'NC': 176, 'ND': 32, 'NE': 56, 'TN': 118, 'NY': 539, 'PA': 275, 'AK': 26, 'NV': 41, 'NH': 43, 'VA': 348, 'GU': 1, 'CO': 165, 'VI': 2, 'CA': 927, 'AL': 77, 'AR': 61, 'VT': 18, 'IL': 337, 'GA': 230, 'IN': 96, 'IA': 100, 'OK': 71, 'AZ': 142, 'ID': 19, 'CT': 152, 'ME': 31, 'MD': 189, 'MA': 334, 'OH': 203, 'UT': 19, 'MO': 91, 'MN': 162, 'MI': 267, 'RI': 28, 'KS': 88, 'MT': 36, 'MS': 34, 'SC': 77, 'KY': 75, 'OR': 78, 'SD': 32})

import operator
# sort the dictionary's (state, count) pairs by item 1 of each pair (the count), ascending;
# .items() is used rather than the Python 2-only .iteritems() so this also runs on Python 3
sorted_state_count = sorted(state_count.items(), key=operator.itemgetter(1))
# convert the sorted list of pairs into an array and display
np.asarray(sorted_state_count)

array([['GU', '1'],
       ['VI', '2'],
       ['ZZ', '4'],
       ['PR', '14'],
       ['', '17'],
       ['VT', '18'],
       ['ID', '19'],
       ['UT', '19'],
       ['DE', '24'],
       ['WY', '26'],
       ['AK', '26'],
       ['RI', '28'],
       ['ME', '31'],
       ['ND', '32'],
       ['SD', '32'],
       ['MS', '34'],
       ['MT', '36'],
       ['WV', '37'],
       ['NV', '41'],
       ['NH', '43'],
       ['HI', '47'],
       ['NE', '56'],
       ['AR', '61'],
       ['NM', '64'],
       ['OK', '71'],
       ['KY', '75'],
       ['AL', '77'],
       ['SC', '77'],
       ['OR', '78'],
       ['KS', '88'],
       ['MO', '91'],
       ['IN', '96'],
       ['IA', '100'],
       ['WI', '103'],
       ['LA', '104'],
       ['TN', '118'],
       ['AZ', '142'],
       ['CT', '152'],
       ['MN', '162'],
       ['CO', '165'],
       ['NC', '176'],
       ['MD', '189'],
       ['WA', '191'],
       ['OH', '203'],
       ['NJ', '220'],
       ['GA', '230'],
       ['DC', '264'],
       ['MI', '267'],
       ['PA', '275'],
       ['MA', '334'],
       ['IL', '337'],
       ['VA', '348'],
       ['FL', '433'],
       ['NY', '539'],
       ['TX', '652'],
       ['CA', '927']], 
      dtype='|S3')

import matplotlib.pyplot as plt
import matplotlib.ticker as plticker

def plot_bar_chart(counts_dict, title="state breakdown"):
    """Draw a bar chart with one bar per (label, count) pair.

    Parameters
    ----------
    counts_dict : sequence of (label, count) pairs
        Despite the name this is indexed as a sequence of pairs, not a
        mapping; bars are drawn in the order given. Each count must be
        convertible with ``float()``. An empty sequence produces an empty
        chart instead of raising.
    title : str, optional
        Title placed above the chart. Defaults to the previously
        hard-coded "state breakdown".

    Returns
    -------
    None
        The chart is drawn on a newly created matplotlib figure.
    """
    # split the pairs once, keeping counts numeric, rather than converting
    # the whole input to a string array twice and parsing the counts back
    pairs = list(counts_dict)
    labels = [label for label, _ in pairs]
    heights = [float(count) for _, count in pairs]
    positions = range(len(pairs))

    fig = plt.figure(facecolor='white', dpi=72)
    fig.set_size_inches(20, 20)
    axes = plt.axes(frameon=False)

    # plot the bars
    axes.bar(positions, heights, align="center", width=0.5, alpha=0.5)

    # one tick per bar; set_xticks installs its own fixed tick positions, so
    # no separate major locator needs to be configured beforehand
    axes.set_xticks(positions)
    axes.set_xticklabels(labels)

    axes.set_title(title)

plot_bar_chart(sorted_state_count)