# IPython magics (only valid inside an IPython/Jupyter session, not plain Python):
# show the working directory, list its contents, and render matplotlib figures inline.
%pwd
%ls
%matplotlib inline
# Configuration: the raw input file, the file that will receive the random
# subset, and the number of lines the subset should contain.
RAW_DATA_FILE = 'itcont.txt'
SUBSET_DATA_FILE = 'itcont_subset.txt'
SUBSET_COUNT = 8000

# Count the lines of the raw file by streaming it, so the file is never held
# in memory. The `with` block closes the handle as soon as counting is done
# instead of leaving that to the garbage collector.
with open(RAW_DATA_FILE) as raw_file:
    num_lines = sum(1 for _ in raw_file)
num_lines
import numpy as np
# Build the index vector [0, 1, 2, ..., num_lines-1]: one entry per line number.
idx = np.arange(0, num_lines)
idx[:5]
# Shuffle the vector in place, so any prefix of it is a random draw of line numbers.
np.random.shuffle(idx)
# For kicks, look at the first five entries of the shuffled array.
idx[:5]
SUBSET_COUNT
# Put the first SUBSET_COUNT entries of the shuffled array into a set.
# Membership in this set of index values decides whether a line of the file
# is extracted for later analysis.
idx_test_set = set(idx[:SUBSET_COUNT])
# FOR THIS EXAMPLE: the random subset is extracted by streaming the raw file,
# without loading the entire dataset into memory.
import sys

# idx_count is the zero-based number of the line being examined;
# output_count is how many lines have been written to the subset so far.
idx_count = 0
output_count = 0
# Open the subset file for writing and the raw file for reading.
with open(SUBSET_DATA_FILE, "w") as output_file:
    with open(RAW_DATA_FILE, "r") as input_file:
        # Iterating the file object yields one line at a time and stops at
        # End Of File; enumerate supplies the matching line number.
        for idx_count, next_line in enumerate(input_file):
            # Only lines whose number was drawn into idx_test_set are kept.
            if idx_count not in idx_test_set:
                continue
            output_file.write(next_line)
            output_count += 1
            # Progress indicator: one dot per 1000 extracted lines.
            if output_count % 1000 == 0:
                sys.stdout.write('.')
# Finish the row of progress dots. An explicit write emits the newline on
# both Python 2 and Python 3 (a bare `print` is a no-op expression on 3).
sys.stdout.write('\n')
# Count the lines in the subset file. The `with` block closes the handle as
# soon as counting is done instead of leaving that to the garbage collector.
with open(SUBSET_DATA_FILE) as subset_file:
    num_lines_subset = sum(1 for _ in subset_file)
num_lines_subset
# Column layout of one record, in file order, as (field name, type) pairs.
# A (str, N) type is a fixed-width text field of N characters; int and float
# fields are parsed as numbers.
# NOTE(review): a date parsed as int loses any leading zero
# (e.g. 01152012 becomes 1152012) -- confirm that later code accounts for it.
ndtype = [
    ('Filer Identification Number', (str, 9)),
    ('Amendment Indicator', (str, 1)),
    ('Report Type', (str, 3)),
    ('Primary-General Indicator', (str, 5)),
    ('Microfilm Location (YYOORRRFFFF) ', (str, 11)),
    ('Transaction Type', (str, 3)),
    ('Entity Type', (str, 3)),
    ('Contributor/Lender/Transfer Name', (str, 200)),
    ('City/Town', (str, 30)),
    ('State', (str, 2)),
    ('Zip Code', (str, 9)),
    ('Employer', (str, 38)),
    ('Occupation', (str, 38)),
    ('Transaction Date(MMDDYYYY)', int),
    ('Transaction Amount', float),
    ('Other Identification Number', (str, 9)),
    ('Transaction ID', (str, 32)),
    ('File Number / Report ID', float),
    ('Memo Code', (str, 1)),
    ('Memo Text', (str, 100)),
    ('FEC Record Number', float),
]
# Parse the pipe-delimited subset file into a structured array with one field
# per ndtype entry. With invalid_raise=False, rows whose column count does not
# match are skipped with a warning instead of raising.
# NOTE(review): genfromtxt treats '#' as a comment marker by default, so a
# record containing '#' would be cut short and then skipped as malformed --
# confirm whether the data can contain '#'.
dataset = np.genfromtxt(
    SUBSET_DATA_FILE,
    dtype=ndtype,
    delimiter='|',
    # usecols=tuple(np.array([4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,])-1),
    invalid_raise=False,
)
# Peek at the 'State' field of records 1 through 4.
dataset['State'][1:5]
from collections import defaultdict
def default_value():
    """Return 0, the starting tally for a state that has not been seen yet."""
    return 0


# Tally the records per state. A missing key is created on first access with
# the value returned by default_value().
# alternately:
#state_count=defaultdict(lambda:0)
state_count = defaultdict(default_value)
for entry in dataset['State']:
    state_count[entry] += 1
state_count
import operator
# Sort the (state, count) pairs of the dictionary by item 1 (the count), in
# ascending order. dict.items() replaces the Python-2-only iteritems() so the
# statement runs unchanged on Python 2 and Python 3.
sorted_state_count = sorted(state_count.items(), key=operator.itemgetter(1))
# Convert the sorted list of pairs into an array and display it.
np.asarray(sorted_state_count)
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
def plot_bar_chart(counts_dict):
    """Draw a bar chart with one bar per (label, count) pair.

    counts_dict -- despite its name, a sequence of (label, count) rows, such
        as the list produced by sorted(d.items()). Bars are drawn in the
        order given, and each count must be convertible with float().
        An empty sequence produces an empty, titled chart instead of
        raising IndexError.

    Returns None; the chart is drawn on a newly created matplotlib figure.
    """
    # Split the rows once, rather than converting the whole input to an array
    # twice. Indexing (not unpacking) keeps rows with extra columns working.
    rows = list(counts_dict)
    labels = [row[0] for row in rows]
    heights = [float(row[1]) for row in rows]
    positions = list(range(len(rows)))

    fig = plt.figure(facecolor='white', dpi=72)
    fig.set_size_inches(20, 20)
    axes = plt.axes(frameon=False)
    # this locator puts ticks at regular intervals
    loc = plticker.MultipleLocator(base=1.0)
    axes.xaxis.set_major_locator(loc)
    # plot the bars, one per row, centered on consecutive integer positions
    axes.bar(positions, heights, align="center", width=0.5, alpha=0.5)
    # label each bar position with its row's label
    axes.set_xticks(positions)
    axes.set_xticklabels(labels)
    axes.set_title("state breakdown")
# Chart the per-state record counts, in ascending order of count.
plot_bar_chart(sorted_state_count)