"""
cd DAT6-Material
git pull
ipython notebook
---- or -----
git clone https://github.com/fidsteve/DAT6-Material.git
"""
%matplotlib inline
# Input file, output file for the extracted sample, and the sample size.
RAW_DATA_FILE = 'itcont.txt'
SUBSET_DATA_FILE = 'itcont_subset.txt'
SUBSET_COUNT = 8000
# Count the lines of the raw file by streaming it. The context manager closes
# the handle as soon as counting finishes instead of leaving it open until
# garbage collection.
with open(RAW_DATA_FILE) as raw_file:
    num_lines = sum(1 for _ in raw_file)
num_lines
import numpy as np
# Candidate line numbers: 0, 1, 2, ..., num_lines - 1.
idx = np.arange(num_lines)
# Shuffle in place so the leading entries form a random sample of line numbers.
np.random.shuffle(idx)
# Peek at the first five shuffled line numbers.
idx[:5]
# Keep the first SUBSET_COUNT shuffled line numbers in a set. A line of the
# file is extracted for later analysis only if its line number is a member.
idx_test_set = set(idx[:SUBSET_COUNT])
# FOR THIS EXAMPLE: the random subset will be extracted without loading the entire dataset into memory
import sys
# idx_count is the 0-based number of the line currently being examined;
# output_count is how many lines have been copied to the subset so far.
idx_count = 0
output_count = 0
# Stream the raw file one line at a time, copying only the sampled line
# numbers, so the whole dataset is never held in memory.
with open(SUBSET_DATA_FILE, "w") as output_file:
    with open(RAW_DATA_FILE, "r") as input_file:
        # Iterating the file object yields each line and stops at end of
        # file, replacing the manual readline() / '' sentinel loop.
        for next_line in input_file:
            # Copy the line only if its number was selected for the sample.
            if idx_count in idx_test_set:
                output_file.write(next_line)
                output_count += 1
                # Progress indicator: one dot per 1000 lines written.
                if output_count % 1000 == 0:
                    sys.stdout.write('.')
            idx_count += 1
# End the row of progress dots. A bare `print` only emits a newline under
# Python 2 (it is a no-op expression on Python 3); an explicit write behaves
# the same on both.
sys.stdout.write('\n')
# Count the lines actually written, closing the file once counted.
with open(SUBSET_DATA_FILE) as subset_file:
    num_lines_subset = sum(1 for _ in subset_file)
num_lines_subset
import pandas as pd
# Column labels applied, in order, to the pipe-delimited fields of the raw
# file when it is loaded with pandas.
file_fields = [
    'Filer Identification Number',
    'Amendment Indicator',
    'Report Type',
    'Primary-General Indicator',
    'Microfilm Location (YYOORRRFFFF)',
    'Transaction Type',
    'Entity Type',
    'Contributor/Lender/Transfer Name',
    'City/Town',
    'State',
    'Zip Code',
    'Employer',
    'Occupation',
    'Transaction Date(MMDDYYYY)',
    'Transaction Amount',
    'Other Identification Number',
    'Transaction ID',
    'File Number / Report ID',
    'Memo Code',
    'Memo Text',
    'FEC Record Number',
]
# Load only the field at position 9 of the pipe-delimited raw file. names=
# labels every field, so the selected column is called 'State'.
dataset = pd.read_table(RAW_DATA_FILE, sep='|',
                        header=None, names=file_fields,
                        usecols=[9])
dataset[0:3]
# Rows per state, kept as a one-column frame whose 'State' column holds the
# counts. size() is used because count() on a frame that contains only the
# grouping column leaves no 'State' column to select in current pandas.
state_count = dataset.groupby('State').size().to_frame('State')
state_count[0:5]
# Order the states by count, smallest first. Series.sort_values replaces
# DataFrame.sort_index(by=...), which current pandas no longer accepts.
sorted_state_count = state_count['State'].sort_values(ascending=True)
sorted_state_count.plot(kind='barh', stacked=False, alpha=0.5)
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
def plot_bar_chart(counts_pd):
    """Draw *counts_pd* as a bar chart titled "Donation Count by State".

    counts_pd is drawn through its own ``.plot(kind='bar', ...)`` method on a
    frameless axes inside a new 20 x 20 inch white figure; its index supplies
    the x tick labels. Nothing is returned.
    """
    figure = plt.figure(facecolor='white', dpi=72)
    figure.set_size_inches(20, 20)
    ax = plt.axes(frameon=False)
    # Locator that places major x ticks at multiples of 1.0.
    tick_locator = plticker.MultipleLocator(base=1.0)
    ax.xaxis.set_major_locator(tick_locator)
    # One tick per entry, labelled with the matching index value.
    tick_positions = range(len(counts_pd))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(counts_pd.index)
    # Plot the bars onto the prepared axes.
    counts_pd.plot(kind='bar', ax=ax, stacked=False, alpha=0.5)
    ax.set_title("Donation Count by State")
# Draw the sorted per-state counts with the plot_bar_chart helper.
plot_bar_chart(sorted_state_count)