In the previous step I compiled a list of the Twitter handles for every college in our sample. The next step is to obtain all tweets created by each handle. This is tricky because the Twitter API limits the number of tweets you can pull. Thankfully, the developers of the twint Python package have figured out how to circumvent these limits by not using the API at all!
All tweets are output to a .csv file for each college's main handle and admissions handle.
import pandas as pd
import numpy as np
import os, requests, re, time
import twint
# Project root on the author's machine, and the folder that holds the
# handle table (the scraping code below also writes its CSV output here).
base_path = r"C:\Users\laure\Dropbox\!research\20181026_ihe_diversity"
tw_path = os.path.join(base_path, 'data', 'twitter_handles')

# Load the pickled DataFrame of handles; the code below reads its
# 'main_handle' and 'adm_handle' columns.
_handles_pickle = os.path.join(tw_path, "tw_df_final")
ihe_handles = pd.read_pickle(_handles_pickle)

# Notebook preview: first two rows whose admissions handle is the empty string.
ihe_handles.loc[ihe_handles['adm_handle'] == ''].head(2)
def scrape_tweets(username, csv_name):
    """Run a twint profile scrape for one handle and store the result as CSV.

    Parameters
    ----------
    username : str
        Twitter handle assigned to the twint config's ``Username``.
    csv_name : str
        Output file name. It is joined onto the module-level ``tw_path``;
        per ``os.path.join`` semantics an absolute ``csv_name`` is used as-is.

    Returns
    -------
    None
        The function only configures twint and triggers the scrape.
    """
    # Columns requested for the CSV output.
    tweet_fields = [
        'id', 'date', 'time', 'timezone', 'user_id', 'username', 'tweet',
        'replies', 'retweets', 'likes', 'hashtags', 'link', 'retweet',
        'user_rt', 'mentions',
    ]

    # Configure
    config = twint.Config()
    config.Username = username
    config.Custom = tweet_fields
    config.Store_csv = True
    config.Output = os.path.join(tw_path, csv_name)

    # Start search
    twint.run.Profile(config)
# Scrape every college: one CSV for its main handle and one for its
# admissions handle, each named after the row's index label.
for ihe_id, record in ihe_handles.iterrows():
    # Pair each handle with its CSV path up front (main first, then admissions).
    jobs = [
        (record['main_handle'], os.path.join(tw_path, str(ihe_id) + '_main.csv')),
        (record['adm_handle'], os.path.join(tw_path, str(ihe_id) + '_adm.csv')),
    ]
    for handle, csv_path in jobs:
        # An empty string marks a missing handle; nothing to scrape then.
        if handle != '':
            scrape_tweets(handle, csv_path)