"""

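If you already have a local copy, update it and start the notebook server:

    cd DAT6-Material
    git pull
    ipython notebook

---- or ----

If you do not have a local copy yet, clone the repository:

    git clone https://github.com/fidsteve/DAT6-Material.git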

"""

%matplotlib inline

# configuration: raw input file, sampled output file, and sample size
RAW_DATA_FILE = 'itcont.txt'
SUBSET_DATA_FILE = 'itcont_subset.txt'
SUBSET_COUNT = 8000

# count the lines in the raw data file by streaming through it once;
# the `with` block closes the file handle as soon as counting is done
with open(RAW_DATA_FILE) as raw_file:
    num_lines = sum(1 for _ in raw_file)
num_lines

883835

import numpy as np

# build an array holding every line index 0 .. num_lines - 1 in random order
idx = np.random.permutation(num_lines)

# for kicks, peek at the first five shuffled indices
idx[:5]

array([507359,  68863, 813902, 757656, 746310])

# take the first SUBSET_COUNT entries of the shuffled array and put them in
# a set; a line from the file is extracted for later analysis only if its
# index value is a member of this (sub)set
selected_idx = idx[:SUBSET_COUNT]
idx_test_set = set(selected_idx)

# FOR THIS EXAMPLE: the random subset will be extracted without loading the entire dataset into memory

import sys

# number of lines written to the subset file so far
output_count = 0

# open the file for writing, then the file for reading; both are closed
# automatically when the `with` block ends
with open(SUBSET_DATA_FILE, "w") as output_file, \
        open(RAW_DATA_FILE, "r") as input_file:
    # iterate the file line by line (never loading it whole);
    # enumerate() supplies the zero-based line number of each line
    for idx_count, next_line in enumerate(input_file):
        # copy the line only if its line number was selected for extraction
        if idx_count in idx_test_set:
            output_file.write(next_line)
            output_count += 1
            # progress indicator: one dot per 1000 lines written
            if output_count % 1000 == 0:
                sys.stdout.write('.')
    # terminate the row of progress dots; written through sys.stdout because
    # a bare `print` only emits a newline on Python 2 and does nothing on 3
    sys.stdout.write('\n')

# count the lines that ended up in the subset file, closing it afterwards
with open(SUBSET_DATA_FILE) as subset_file:
    num_lines_subset = sum(1 for _ in subset_file)
num_lines_subset

........

8000

import pandas as pd

# column names for the records in the data file, listed in column order
file_fields = [
    'Filer Identification Number',
    'Amendment Indicator',
    'Report Type',
    'Primary-General Indicator',
    'Microfilm Location (YYOORRRFFFF)',
    'Transaction Type',
    'Entity Type',
    'Contributor/Lender/Transfer Name',
    'City/Town',
    'State',
    'Zip Code',
    'Employer',
    'Occupation',
    'Transaction Date(MMDDYYYY)',
    'Transaction Amount',
    'Other Identification Number',
    'Transaction ID',
    'File Number / Report ID',
    'Memo Code',
    'Memo Text',
    'FEC Record Number',
]

# load only the column at position 9 ('State') of the pipe-delimited file;
# the file has no header row, so column names come from file_fields
dataset = pd.read_table(RAW_DATA_FILE, sep='|',
                        header=None, names=file_fields,
                        usecols=[9])

dataset.head(3)

# number of records per state, as a Series indexed by state.
# size() is used instead of count() because count() on current pandas drops
# the grouping column, which would leave nothing to count in this
# single-column frame.
state_count = dataset.groupby('State').size()
state_count.head(5)

# order the states from the fewest records to the most; sort_values()
# replaces the removed sort_index(by=...) form (requires pandas >= 0.17)
sorted_state_count = state_count.sort_values(ascending=True)

sorted_state_count.plot(kind='barh', stacked=False, alpha=0.5)

<matplotlib.axes.AxesSubplot at 0x10b260210>

import matplotlib.pyplot as plt
import matplotlib.ticker as plticker

def plot_bar_chart(counts_pd, title="Donation Count by State"):
    """Draw `counts_pd` as a bar chart on a new 20 x 20 inch figure.

    counts_pd -- pandas object to plot; its index supplies the x tick
                 labels and it is drawn with counts_pd.plot(kind='bar')
    title     -- text placed above the chart; the default is the title
                 this function previously hard-coded

    Returns nothing; the chart is drawn on a newly created figure.
    """
    fig = plt.figure(facecolor='white', dpi=72)
    fig.set_size_inches(20, 20)
    axes = plt.axes(frameon=False)

    # one tick per entry, labelled with the matching index value.
    # (No separate tick locator is installed: set_xticks() replaces the
    # axis' major locator, so a locator set before it has no effect.)
    axes.set_xticks(range(len(counts_pd)))
    axes.set_xticklabels(counts_pd.index)

    # plot the bars
    counts_pd.plot(kind='bar', ax=axes, stacked=False, alpha=0.5)

    axes.set_title(title)

# draw the sorted per-state counts with the plot_bar_chart helper
plot_bar_chart(sorted_state_count)

	State
0	WV
1	SC
2	SC

	State
State
AA	4
AE	6
AK	3200
AL	8803
AP	2