Lab

Twitter & MongoDB

Alessandro D. Gagliardi

Importing Libraries

# standard library:
import os  

from pprint import pprint

# other modules:
import matplotlib.pyplot as plt
import pandas as pd
import twitter
import yaml

from pymongo import MongoClient

Twitter Trends

Connect to Twitter:¶

credentials = yaml.load(open(os.path.expanduser('~/api_cred.yml')))

auth = twitter.oauth.OAuth(credentials['ACCESS_TOKEN'], 
                           credentials['ACCESS_TOKEN_SECRET'],
                           credentials['API_KEY'],
                           credentials['API_SECRET'])

twitter_api = twitter.Twitter(auth=auth)

# The Yahoo! Where On Earth ID for the entire world is 1.
# See https://dev.twitter.com/docs/api/1.1/get/trends/place and
# http://developer.yahoo.com/geo/geoplanet/

WORLD_WOE_ID = 1
US_WOE_ID = 23424977

# Prefix ID with the underscore for query string parameterization.
# Without the underscore, the twitter package appends the ID value
# to the URL itself as a special case keyword argument.

world_trends = twitter_api.trends.place(_id=WORLD_WOE_ID)
us_trends = twitter_api.trends.place(_id=US_WOE_ID)

pprint(world_trends)

[{u'as_of': u'2014-04-20T02:17:08Z',
  u'created_at': u'2014-04-20T02:12:36Z',
  u'locations': [{u'name': u'Worldwide', u'woeid': 1}],
  u'trends': [{u'name': u'#SiempreMePasaEstoDe',
               u'promoted_content': None,
               u'query': u'%23SiempreMePasaEstoDe',
               u'url': u'http://twitter.com/search?q=%23SiempreMePasaEstoDe'},
              {u'name': u'#MahoganyRT',
               u'promoted_content': None,
               u'query': u'%23MahoganyRT',
               u'url': u'http://twitter.com/search?q=%23MahoganyRT'},
              {u'name': u'#ClanessaNoAltasHoras',
               u'promoted_content': None,
               u'query': u'%23ClanessaNoAltasHoras',
               u'url': u'http://twitter.com/search?q=%23ClanessaNoAltasHoras'},
              {u'name': u'Happy Easter Sunday',
               u'promoted_content': None,
               u'query': u'%22Happy+Easter+Sunday%22',
               u'url': u'http://twitter.com/search?q=%22Happy+Easter+Sunday%22'},
              {u'name': u'#KeepUpIndonesia',
               u'promoted_content': None,
               u'query': u'%23KeepUpIndonesia',
               u'url': u'http://twitter.com/search?q=%23KeepUpIndonesia'},
              {u'name': u'#EsDePocoHombre',
               u'promoted_content': None,
               u'query': u'%23EsDePocoHombre',
               u'url': u'http://twitter.com/search?q=%23EsDePocoHombre'},
              {u'name': u'Mark Herberholz',
               u'promoted_content': None,
               u'query': u'%22Mark+Herberholz%22',
               u'url': u'http://twitter.com/search?q=%22Mark+Herberholz%22'},
              {u'name': u'Jesus is Alive',
               u'promoted_content': None,
               u'query': u'%22Jesus+is+Alive%22',
               u'url': u'http://twitter.com/search?q=%22Jesus+is+Alive%22'},
              {u'name': u'30-27 Werdum',
               u'promoted_content': None,
               u'query': u'%2230-27+Werdum%22',
               u'url': u'http://twitter.com/search?q=%2230-27+Werdum%22'},
              {u'name': u'1458 Espectadores',
               u'promoted_content': None,
               u'query': u'%221458+Espectadores%22',
               u'url': u'http://twitter.com/search?q=%221458+Espectadores%22'}]}]

pprint(us_trends)

[{u'as_of': u'2014-04-20T02:17:09Z',
  u'created_at': u'2014-04-20T02:12:36Z',
  u'locations': [{u'name': u'United States', u'woeid': 23424977}],
  u'trends': [{u'name': u'#MahoganyRT',
               u'promoted_content': None,
               u'query': u'%23MahoganyRT',
               u'url': u'http://twitter.com/search?q=%23MahoganyRT'},
              {u'name': u'#romeoplaymmmyeah',
               u'promoted_content': None,
               u'query': u'%23romeoplaymmmyeah',
               u'url': u'http://twitter.com/search?q=%23romeoplaymmmyeah'},
              {u'name': u'#ADayLateAndADollarShort',
               u'promoted_content': None,
               u'query': u'%23ADayLateAndADollarShort',
               u'url': u'http://twitter.com/search?q=%23ADayLateAndADollarShort'},
              {u'name': u'Westbrook and Durant',
               u'promoted_content': None,
               u'query': u'%22Westbrook+and+Durant%22',
               u'url': u'http://twitter.com/search?q=%22Westbrook+and+Durant%22'},
              {u'name': u'#WerdumvsBrowne',
               u'promoted_content': None,
               u'query': u'%23WerdumvsBrowne',
               u'url': u'http://twitter.com/search?q=%23WerdumvsBrowne'},
              {u'name': u'#TenCommandments',
               u'promoted_content': None,
               u'query': u'%23TenCommandments',
               u'url': u'http://twitter.com/search?q=%23TenCommandments'},
              {u'name': u'Travis Browne',
               u'promoted_content': None,
               u'query': u'%22Travis+Browne%22',
               u'url': u'http://twitter.com/search?q=%22Travis+Browne%22'},
              {u'name': u'Tomorrow is Easter',
               u'promoted_content': None,
               u'query': u'%22Tomorrow+is+Easter%22',
               u'url': u'http://twitter.com/search?q=%22Tomorrow+is+Easter%22'},
              {u'name': u'The Devil Wears Prada',
               u'promoted_content': None,
               u'query': u'%22The+Devil+Wears+Prada%22',
               u'url': u'http://twitter.com/search?q=%22The+Devil+Wears+Prada%22'},
              {u'name': u'Derek Fisher',
               u'promoted_content': None,
               u'query': u'%22Derek+Fisher%22',
               u'url': u'http://twitter.com/search?q=%22Derek+Fisher%22'}]}]

Do any of these trends intersect?¶

world_trends_set = set([trend['name'] for trends in world_trends
                        for trend in trends['trends']])

us_trends_set = set([trend['name'] for trends in us_trends
                     for trend in trends['trends']]) 

world_trends_set.intersection(us_trends_set)

{u'#MahoganyRT'}

Twitter Statuses

q = world_trends[0]['trends'][0]['name']

count = 100

# See https://dev.twitter.com/docs/api/1.1/get/search/tweets

search_results = twitter_api.search.tweets(q=q, count=count)

statuses = search_results['statuses']

len(statuses)

100

More?¶

search_results['search_metadata'].get('next_results')

u'?max_id=457704145387257855&q=%23SiempreMePasaEstoDe&count=100&include_entities=1'

if 'next_results' in search_results['search_metadata']:
    next_results = search_results['search_metadata']['next_results']
    kwargs = dict([ kv.split('=') for kv in next_results[1:].split("&") ])
    search_results = twitter_api.search.tweets(**kwargs)
    statuses += search_results['statuses']

len(statuses)

200

Mongo

from the command line:

$ mkdir -p data/db
$ mongod --dbpath data/db

Connect to our Mongo database management server (DBMS):¶

c = MongoClient()

Create a database called `twitter`:¶

db = c.twitter

Yes, it's that easy.

Insert the results we pulled from Twitter and store them as a document in a collection named `tweets`:¶

statuses_ids = db.tweets.insert(statuses)
statuses_ids[:5]

[ObjectId('53532e26bcdefe1199f65732'),
 ObjectId('53532e26bcdefe1199f65733'),
 ObjectId('53532e26bcdefe1199f65734'),
 ObjectId('53532e26bcdefe1199f65735'),
 ObjectId('53532e26bcdefe1199f65736')]

`ObjectId`s¶

ObjectId is a 12-byte BSON type, constructed using:

a 4-byte value representing the seconds since the Unix epoch,
a 3-byte machine identifier,
a 2-byte process id, and
a 3-byte counter, starting with a random value.

In MongoDB, documents stored in a collection require a unique _id field that acts as a primary key. Because ObjectIds are small, most likely unique, and fast to generate, MongoDB uses ObjectIds as the default value for the _id field if the _id field is not specified.

Using ObjectIds for the _id field provides the following additional benefits:

you can access the insertion time of the ObjectId, using the .generation_time property in pymongo.
sorting on an _id field that stores ObjectId values is roughly equivalent to sorting by insertion time.

Test the contents of our database:¶

c.database_names()

[u'local', u'twitter']

db = c.twitter
db.collection_names()

[u'system.indexes', u'tweets', u'canada', u'us']

db.tweets.find_one()

{u'_id': ObjectId('53531654bcdefe0e658cbf07'),
 u'contributors': None,
 u'coordinates': None,
 u'created_at': u'Sun Apr 20 00:34:04 +0000 2014',
 u'entities': {u'hashtags': [{u'indices': [11, 22], u'text': u'NashTo2Mil'},
   {u'indices': [23, 34], u'text': u'NashTo2Mil'},
   {u'indices': [35, 46], u'text': u'Nash2ToMil'},
   {u'indices': [47, 58], u'text': u'NashTo2Mil'},
   {u'indices': [59, 70], u'text': u'NashTo2Mil'},
   {u'indices': [71, 82], u'text': u'Nash2ToMil'},
   {u'indices': [83, 94], u'text': u'NashTo2Mil'},
   {u'indices': [95, 106], u'text': u'NashTo2Mil'},
   {u'indices': [107, 118], u'text': u'Nash2ToMil'}],
  u'symbols': [],
  u'urls': [],
  u'user_mentions': [{u'id': 310072711,
    u'id_str': u'310072711',
    u'indices': [0, 10],
    u'name': u'Nash',
    u'screen_name': u'Nashgrier'}]},
 u'favorite_count': 0,
 u'favorited': False,
 u'geo': None,
 u'id': 457678549789708288L,
 u'id_str': u'457678549789708288',
 u'in_reply_to_screen_name': u'Nashgrier',
 u'in_reply_to_status_id': None,
 u'in_reply_to_status_id_str': None,
 u'in_reply_to_user_id': 310072711,
 u'in_reply_to_user_id_str': u'310072711',
 u'lang': u'it',
 u'metadata': {u'iso_language_code': u'it', u'result_type': u'recent'},
 u'place': None,
 u'retweet_count': 0,
 u'retweeted': False,
 u'source': u'<a href="https://twitter.com/download/android" rel="nofollow">Twitter for  Android</a>',
 u'text': u'@Nashgrier #NashTo2Mil #NashTo2Mil #Nash2ToMil #NashTo2Mil #NashTo2Mil #Nash2ToMil #NashTo2Mil #NashTo2Mil #Nash2ToMil x85',
 u'truncated': False,
 u'user': {u'contributors_enabled': False,
  u'created_at': u'Fri Aug 16 23:27:37 +0000 2013',
  u'default_profile': True,
  u'default_profile_image': False,
  u'description': u'one direction \u2665_\u2665',
  u'entities': {u'description': {u'urls': []}},
  u'favourites_count': 30,
  u'follow_request_sent': False,
  u'followers_count': 61,
  u'following': False,
  u'friends_count': 86,
  u'geo_enabled': False,
  u'id': 1676879275,
  u'id_str': u'1676879275',
  u'is_translation_enabled': False,
  u'is_translator': False,
  u'lang': u'en',
  u'listed_count': 1,
  u'location': u'ireland',
  u'name': u'niamh',
  u'notifications': False,
  u'profile_background_color': u'C0DEED',
  u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme1/bg.png',
  u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme1/bg.png',
  u'profile_background_tile': False,
  u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/1676879275/1393193679',
  u'profile_image_url': u'http://pbs.twimg.com/profile_images/437708680042405889/1Loj3lOe_normal.jpeg',
  u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/437708680042405889/1Loj3lOe_normal.jpeg',
  u'profile_link_color': u'0084B4',
  u'profile_sidebar_border_color': u'C0DEED',
  u'profile_sidebar_fill_color': u'DDEEF6',
  u'profile_text_color': u'333333',
  u'profile_use_background_image': True,
  u'protected': False,
  u'screen_name': u'radicalzniall',
  u'statuses_count': 377,
  u'time_zone': u'Atlantic Time (Canada)',
  u'url': None,
  u'utc_offset': -10800,
  u'verified': False}}

Notice the _id included in the document along with the values we already saw before.

Now that we have our data in MongoDB, we can use some of it's search functionality. For example:

popular_tweets = db.tweets.find({'retweet_count': {"$gte": 3}})
popular_tweets.count()

52

Making Sense with Pandas

pd.DataFrame(db.tweets.find(fields=['created_at', 'retweet_count', 'favorite_count']))

---------------------------------------------------------------------------
PandasError                               Traceback (most recent call last)
<ipython-input-17-87a1cfed5405> in <module>()
----> 1 pd.DataFrame(db.tweets.find(fields=['created_at', 'retweet_count', 'favorite_count']))

/Users/alessandro.gagliardi/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy)
    271                                          copy=False)
    272             else:
--> 273                 raise PandasError('DataFrame constructor not properly called!')
    274 
    275         NDFrame.__init__(self, mgr, fastpath=True)

PandasError: DataFrame constructor not properly called!

retweet_favorites = pd.DataFrame(list(db.tweets.find(fields=['created_at','retweet_count','favorite_count'])))
retweet_favorites.head()

.describe() is a useful method to get the gist of our data.

retweet_favorites.describe()

However, when applied to a DataFrame, it only describes numeric columns.

retweet_favorites.dtypes

_id               object
created_at        object
favorite_count     int64
retweet_count      int64
dtype: object

.describe() can be called on individual columns (i.e. Series), even if they are not numeric.

retweet_favorites.created_at.describe()

count                                400
unique                               117
top       Sun Apr 20 00:33:55 +0000 2014
freq                                  16
Name: created_at, dtype: object

However, in this case created_at is being treated as a string, which is not very helpful. We can fix that with pandas.to_datetime:

retweet_favorites.created_at.map(pd.to_datetime).describe()

count                     400
unique                    117
first     2014-04-20 00:33:45
last      2014-04-20 02:16:53
top       2014-04-20 00:33:55
freq                       16
Name: created_at, dtype: object

Not all that interesting though, since all of these tweets were collected within a couple seconds of each other.

Mongo allows us to access subfields directly.

mentions_followers = list(db.tweets.find(fields=['entities.user_mentions', 'user.followers_count']))
pd.DataFrame(mentions_followers).head()

Pandas doesn't know how to parse the sub-documents however, so we must tell it explicitly:

mentions_followers_df = pd.DataFrame({'user_mentions': len(tweet['entities']['user_mentions']), 
              'followers_count': tweet['user'].get('followers_count')} for tweet in mentions_followers)
mentions_followers_df.head()

Perhaps user_mentions and followers_count are correlated?

plt.scatter(mentions_followers_df.user_mentions, mentions_followers_df.followers_count)
mentions_followers_df.corr()

Perhaps not.

Lab

1-2 Pairs

Raise two fingers if you understand the material well.
Raise one finger if you understand this material OK.
Now, 1 find a 2 and sit next to them. While you work through this exercise, only 1's can type.

The Yahoo! Where On Earth ID of Canada is 23424775.
Use it to find Twitter trends in Canada and compare it to US trends. What's the difference between them?

Insert the last 100 tweets for all Canada-only trends into a canada collection.
Insert the last 100 tweets for all US-only trends into a us collections.
For this (small and not very representative) sample, which users have the highest follower_count?

Individually:

Find tweets where 'retweet_count': {"$gt": 0}.
What is their count?
Is there anything about them that stands out as retweetable?

Further exercises:

Twitter - Mining-the-Social-Web-2nd-Edition: Chapter 1 - Mining Twitter
Mongo - Aggregation Framework

	favorite_count	retweet_count
count	400.0000	400.000000
mean	0.0150	125.025000
std	0.1408	1578.105185
min	0.0000	0.000000
25%	0.0000	0.000000
50%	0.0000	0.000000
75%	0.0000	1.000000
max	2.0000	22327.000000

	_id	entities	user
0	53531654bcdefe0e658cbf07	{u'user_mentions': [{u'indices': [0, 10], u'sc...	{u'followers_count': 61}
1	53531654bcdefe0e658cbf08	{u'user_mentions': [{u'indices': [3, 13], u'sc...	{u'followers_count': 635}
2	53531654bcdefe0e658cbf09	{u'user_mentions': [{u'indices': [0, 10], u'sc...	{u'followers_count': 32}
3	53531654bcdefe0e658cbf0a	{u'user_mentions': [{u'indices': [0, 10], u'sc...	{u'followers_count': 14}
4	53531654bcdefe0e658cbf0b	{u'user_mentions': []}	{u'followers_count': 18}

Lab

Twitter & MongoDB

Importing Libraries

Twitter Trends

Connect to Twitter:¶

Do any of these trends intersect?¶

Twitter Statuses

More?¶

Mongo

Connect to our Mongo database management server (DBMS):¶

Create a database called `twitter`:¶

Yes, it's that easy.

Insert the results we pulled from Twitter and store them as a document in a collection named `tweets`:¶

`ObjectId`s¶

Test the contents of our database:¶

Making Sense with Pandas

Lab

1-2 Pairs

Individually:

Next week:

Exploring Data

	_id	created_at	retweet_count
0	53531654bcdefe0e658cbf07	Sun Apr 20 00:34:04 +0000 2014	0
1	53531654bcdefe0e658cbf08	Sun Apr 20 00:34:04 +0000 2014	22327
2	53531654bcdefe0e658cbf09	Sun Apr 20 00:34:04 +0000 2014	0
3	53531654bcdefe0e658cbf0a	Sun Apr 20 00:34:04 +0000 2014	0
4	53531654bcdefe0e658cbf0b	Sun Apr 20 00:34:04 +0000 2014	0

	followers_count	user_mentions
followers_count	1.000000	-0.065969
user_mentions	-0.065969	1.000000

Lab

Twitter & MongoDB

Importing Libraries

Twitter Trends

Connect to Twitter:¶

Do any of these trends intersect?¶

Twitter Statuses

More?¶

Mongo

Connect to our Mongo database management server (DBMS):¶

Create a database called twitter:¶

Yes, it's that easy.

Insert the results we pulled from Twitter and store them as a document in a collection named tweets:¶

ObjectIds¶

Test the contents of our database:¶

Making Sense with Pandas

Lab

1-2 Pairs

Individually:

Next week:

Exploring Data

Create a database called `twitter`:¶

Insert the results we pulled from Twitter and store them as a document in a collection named `tweets`:¶

`ObjectId`s¶