In the previous step we obtained all tweets for our sample of colleges and universities. Now we must process the messy tweet text for analysis. There are many different approaches to cleaning tweet data for text analysis and topic modelling. For example, some authors combine tweets that share a hashtag into a single document. Upon inspection, however, I decided that would not be helpful in the context of colleges' and universities' social media use, because many universities use hashtags that are simply the institution's name, which is too generic and frequent (e.g. #somecollege), or highly specific hashtags that are rarely used (e.g. #somecollegehomecoming).
Additionally, some colleges only use a hashtag when mentioning the topic I am most interested in for this work -- #diversity -- while others do not, e.g. "We support #womenoncampus" vs. "We support women on campus." Because of this, I use the wordsegment package to attempt to split multi-word hashtags into their component words. Note, however, that this is not a perfect process when words overlap.
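As a quick illustration (the example hashtags here are made up, and the exact splits depend on the unigram and bigram counts shipped with wordsegment, so the commented outputs are expectations rather than guarantees):
# Illustration only: outputs depend on wordsegment's built-in corpus
from wordsegment import load, segment
load()
print(segment('womenoncampus'))          # expect something like ['women', 'on', 'campus']
print(segment('somecollegehomecoming'))  # overlapping or made-up words may split less cleanly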
Finally, because this analysis looks at words used across colleges, not just at a particular college, I remove words that are used by only one college in the data. This helps with the problem mentioned above of colleges hashtagging their own names: a college's own name occurs frequently in the data, but not frequently enough to be removed by standard text-processing techniques such as dropping words that appear in more than a certain percentage of documents.
import pandas as pd
import numpy as np
from collections import defaultdict
The preprocessor package will remove emojis, mentions, and URLs from tweets.
# Note: There is a bug installing this package on Windows that was fixed by a kind user on
# GitHub. Windows users should install it as follows:
# pip install git+git://github.com/iamRusty/preprocessor
import preprocessor as pre
import re, glob, datetime
The wordsegment package splits multi-word hashtags into individual words.
from wordsegment import load, segment
# Help with memory issues
import gc
The unicodedata2 package helps to convert Unicode text to ASCII.
import unicodedata2
The nltk and gensim packages are used for stemming and removing stop words.
# Tools for LDA
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
# Test data
twitter_files = glob.glob('../data/twitter_data/*.json')
test_file = twitter_files[1]
test_df = pd.read_json(test_file, encoding='utf-8', lines=True)
# Test tweet
test_tweet = test_df.tweet[99]
print(test_tweet)
# Load segmentation dictionary
load()
# Set options for Twitter processing - remove url, mention, and emojis
pre.set_options(pre.OPT.URL, pre.OPT.EMOJI, pre.OPT.MENTION, pre.OPT.SMILEY)
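As a quick sanity check of these options on a made-up tweet (the mention, URL, and emoji should be stripped, while the hashtag text is left in place for the segmentation step below):
# Sanity check on a made-up example tweet - mention, URL, and emoji should be removed
sample = "Proud of our students! @someuser https://example.com #WomenOnCampus 🎓"
print(pre.clean(sample))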
The following function will clean the hashtags, removing hashes and numbers, as well as segmenting out individual words.
# Hash fix - will be called from within tweet cleaning function
def hash_fix(h):
    h1 = re.sub(r'[0-9]+', '', h)
    h2 = re.sub(r'#', '', h1)
    h3 = segment(str(h2))
    h4 = ' '.join(map(str, h3))
    return h4
hash_fix('#UAHDiscoveryDay2017')
Create a dictionary that maps each hashtag to the clean text it should be replaced with.
# Inputs: dataframe with the tweets and the column with the hashtags
def hash_dict(df, hash_col):
    # Create a dataframe of all hashtags in a column and their counts
    # Note: hashtags are in lists inside a cell e.g. [#hash1, #hash2]
    tag_counts = df[hash_col].apply(pd.Series).stack().value_counts().to_frame()
    tag_counts = tag_counts.reset_index()
    tag_counts.columns = ['hash', 'freq']
    # Remove numbers and segment multiple words using hash_fix
    tag_counts = tag_counts.assign(clean_tag = tag_counts.hash.apply(lambda x: hash_fix(x)))
    # Create a dictionary of the hashtags and their clean strings
    tag_counts.set_index('hash', inplace=True)
    tag_dict = tag_counts['clean_tag'].to_dict()
    return tag_dict
tag_dict = hash_dict(test_df,'hashtags')
tag_dict
test_df.tweet.replace(tag_dict, regex=True)
# Function to clean tweet dataframe
def clean_tweets(df, drop_cols):
    # Create dictionary of "cleaned" hashtags
    tag_dict = hash_dict(df, 'hashtags')
    # Create column with clean tweets
    df = df.assign(clean_tw = df.tweet.apply(lambda x: pre.clean(str(x))))
    # Replace hashtags in clean tweets using dictionary
    df = df.assign(clean_tw = df.clean_tw.replace(tag_dict, regex=True))
    # Drop unused cols
    df = df.drop(drop_cols, axis=1)
    return df
# Example: Clean DF
# Columns to drop
drop_cols = ['created_at', 'gif_thumb', 'is_quote_status', 'is_reply_to',
'location', 'mentions', 'name', 'place', 'quote_id', 'quote_url',
'replies', 'retweet', 'tags', 'time', 'timezone']
new_df = clean_tweets(test_df, drop_cols)
new_df.clean_tw.head(5)
# Get list of JSON files
twitter_files = glob.glob('../data/twitter_data/*.json')
print(len(twitter_files)/100)
# Note: Having memory issues so I'm breaking the file list into chunks
# If you have files you need to run individually, set no_tricks = False
# and put your "tricky" file into line 9
def appended_files(file_list, chunk_num, drop_cols, no_tricks):
    appended_data = []
    for file in file_list:
        # Print file and time for debugging purposes
        print(str(re.findall(r"[0-9].*", str(file))[0]) + ' - ' + str(datetime.datetime.now()))
        tw_df = pd.read_json(file, encoding='utf-8', lines=True)
        # Code is hanging on this large file - skip cleaning it
        if (no_tricks == True and str(re.findall(r"[0-9].*", str(file))[0]) == '217156_main.json'):
            continue
        # Clean dataframe using function from previous step
        tw_df = clean_tweets(tw_df, drop_cols)
        # Extract ID number
        id_num = ''.join([i for i in str(file) if i.isdigit()])
        # Add ID number to DF
        tw_df.loc[:, 'ipeds_id'] = id_num
        # Add indicator for main page vs admissions page
        if 'main' in str(file):
            tw_df.loc[:, 'main_page'] = 1
        elif 'adm' in str(file):
            tw_df.loc[:, 'main_page'] = 0
        else:
            pass
        # Reset index
        tw_df.reset_index(drop=True, inplace=True)
        # Add to list to append
        if (len(file_list) > 1):
            appended_data.append(tw_df)
        # Run garbage collection
        gc.collect()
    # Concatenate files in list
    if (len(file_list) > 1):
        tw_df = pd.concat(appended_data, ignore_index=True)
    # Save concatenated files to pickle
    tw_df.to_pickle(path=r'../data/twitter_data/pickle/concat_tw_' + str(chunk_num) + '.pkl')
    # Delete object
    del tw_df
# Create a function to create "chunks" of the list of files
# https://stackoverflow.com/questions/434287/what-is-the-most-pythonic-way-to-iterate-over-a-list-in-chunks
def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))
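A quick check of the chunking behaviour on a small list (the final chunk simply holds the leftover items):
# Example: chunks of size 3 from a 7-item list
print(list(chunker(['a', 'b', 'c', 'd', 'e', 'f', 'g'], 3)))
# [['a', 'b', 'c'], ['d', 'e', 'f'], ['g']]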
# Columns to drop
drop_cols = ['created_at', 'gif_thumb', 'is_quote_status', 'is_reply_to',
'location', 'mentions', 'name', 'place', 'quote_id', 'quote_url',
'replies', 'retweet', 'tags', 'time', 'timezone']
gc.collect()
# Clean and append files in chunks
# Note: Computer hanging up - do this in chunks
# If the process hangs and you need to restart at a chunk, change twitter_files to
# twitter_files[start_file_number:] and set i to the number of that chunk
# Example: with chunks of 50, i = 5 and twitter_files[200:] will restart at the 5th chunk
i = 1
for group in chunker(twitter_files, 50):
    chunk_str = "%02d" % (i,)
    appended_files(group, chunk_str, drop_cols, no_tricks=True)
    # Run garbage collection
    gc.collect()
    i += 1
# For some reason one .json file will not process. Tried stripping non-ascii characters.
# It will have to be left out for now.
# Run one large, pesky file separately - 217156_main.json
#appended_files(['../data/twitter_data/217156_fixed_main.json'], '217156', drop_cols, no_tricks=False)
# Concatenate only the cleaned tweets from the cleaned pkl files
# Saves memory space
gc.collect()
pkl_files = glob.glob(r'../data/twitter_data/pickle/concat_tw_*.pkl')
append_tweets = []
for file in pkl_files:
    print(str(file))
    tw_df = pd.read_pickle(file)
    tw_df = tw_df[['ipeds_id', 'id', 'clean_tw']]
    append_tweets.append(tw_df)
    del tw_df
    gc.collect()
tw_final = pd.concat(append_tweets, ignore_index=True)
tw_final.to_pickle(path=r'../data/clean/clean_tweets_full.pkl')
# Load the pickled data
tw_data = pd.read_pickle(path=r'../data/clean/clean_tweets_full.pkl')
Unfortunately, there are still non-ASCII characters in the tweets - let's remove them now.
def convert_unicode(text):
    return unicodedata2.normalize('NFKD', text).encode('ascii', 'ignore')
tw_data = tw_data.assign(clean_tw = tw_data.clean_tw.apply(convert_unicode))
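Note that .encode('ascii', 'ignore') returns a bytes object rather than a str. gensim's tokenizer decodes it downstream, but if you prefer to keep clean_tw as plain strings, a small variant (my addition, not part of the original pipeline) decodes back to text:
# Variant that returns a str instead of bytes; otherwise identical to convert_unicode
def convert_unicode_str(text):
    return unicodedata2.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')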
The preprocessing code below is adapted from the article linked here; note that the original author forgot to define stemmer, so it is defined explicitly.
https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
# Stem and remove stop words
stemmer = SnowballStemmer("english", ignore_stopwords=True)
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result
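A quick check of the full pipeline on a made-up sentence; treat the commented output as an expectation, since the exact tokens depend on the gensim stopword list and the NLTK stemmer/lemmatizer versions:
# Stopwords and short tokens are dropped, remaining tokens are lemmatized then stemmed
print(preprocess("We support #diversity and women on campus"))
# Expect roughly: ['support', 'divers', 'women', 'campus']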
tw_data = tw_data.assign(words = tw_data['clean_tw'].map(preprocess))
gc.collect()
# Save a copy in case you need to reload
tw_data.to_pickle(path=r'../data/clean/pre_processed.pkl')
tw_data = pd.read_pickle(path=r'../data/clean/pre_processed.pkl')
# Create a copy of the data with only the processed data, id, and school id
tw_words = tw_data[['ipeds_id', 'id', 'words']].copy()
del tw_data
gc.collect()
np.random.seed(12345)
# Create a column with random number - will be used to subset the data
tw_words = tw_words.assign(rand_int = np.random.randint(0, 99, tw_words.shape[0]))
# Remove words used by only one school - this will help to filter out school names and mascots
# This is tricky again because of memory issues
# Stack the words in chunks - remove words that are used by multiple schools
# Final result will be list of words used only by one school
# Must have list of words in tweet in column called 'words'
def single_sch_words(df):
    # Stack the data - create row for each word in list
    df = df.set_index(['id'])
    df.sort_index(inplace=True)
    s = df.apply(lambda x: pd.Series(x['words']), axis=1).stack().reset_index(level=1, drop=True)
    s.name = 'word'
    tw_data_stacked = df.drop('words', axis=1).join(s)
    del s
    gc.collect()
    # Keep only one instance of each word-school pair
    # (keep='first' retains one row per pair; keep=False would drop any pair occurring
    # more than once, so frequently used school names would never be flagged)
    tw_data_stacked = tw_data_stacked.drop_duplicates(subset=['word', 'ipeds_id'], keep='first')
    # Create count of number of colleges that use each word
    num_colleges = tw_data_stacked.groupby('word').agg({'ipeds_id': lambda x: x.nunique()})
    num_colleges.rename(index=str, columns={'ipeds_id': 'school_count'}, inplace=True)
    tw_data_stacked = tw_data_stacked.join(num_colleges, on='word')
    del num_colleges
    gc.collect()
    # Keep words used by only one college
    tw_data_stacked = tw_data_stacked.loc[tw_data_stacked.school_count == 1]
    tw_data_stacked = tw_data_stacked.drop('school_count', axis=1)
    gc.collect()
    return tw_data_stacked
# Datasets of single use words for random chunks of the data
# Example using 1% of data
single_words1 = single_sch_words(tw_words.loc[tw_words.rand_int < 1])
single_words1.head(5)
# Get single words from random samples of 3% of data at a time then append
append_single = []
for i in range(0,102,3):
    j = i + 2
    print('Start at: ' + str(i) + ' End at: ' + str(j) + ' - ' + str(datetime.datetime.now()))
    df = single_sch_words(tw_words.loc[(tw_words.rand_int >= i) & (tw_words.rand_int <= j)])
    append_single.append(df)
    gc.collect()
# Combine the single-word sets from all chunks and check whether any word now appears for more than one school
single_words_all = pd.concat(append_single, ignore_index=True)
# Save in case of crash
single_words_all.to_pickle(path=r'../data/clean/single_words.pkl')
single_words_all = pd.read_pickle(path=r'../data/clean/single_words.pkl')
# On further inspection, many of the single words look like mentions that weren't removed with the Twitter preprocessor.
single_words_all.head(10)
# On the combined dataset of random samples, count the number of schools that use each word
num_colleges = single_words_all.groupby('word').agg({'ipeds_id': lambda x: x.nunique()})
num_colleges.name = 'word'
num_colleges.reset_index()
num_colleges.rename(index=str, columns={'ipeds_id':'school_count'}, inplace=True)
single_words_all = num_colleges
single_words_all = single_words_all.loc[single_words_all.school_count == 1]
single_words_all.head(10)
single_words_all = single_words_all.reset_index()
single_words_all.columns
single_word_list = single_words_all.word.values.tolist()
single_word_list[0:10]
Remove words used by only one college from the dataset.
tw_words.drop(['rand_int'], inplace=True, axis=1)
def remove_single(old_list, not_list):
    return [x for x in old_list if x not in not_list]
# Note: this takes a very, very long time to run with a plain list lookup
# I did not have time to finish running it, sadly - see the set-based sketch below for a possible speedup
tw_words = tw_words.assign(words = tw_words.words.apply(lambda x: remove_single(x, single_word_list)))
tw_words.to_pickle(path=r'../data/clean/clean_tweets_final.pkl')
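If this step needs to be re-run, one likely speedup is converting single_word_list to a set, since membership tests against a Python list are O(n) while set lookups are O(1). A minimal sketch of that variant (my addition; the filtering logic is otherwise the same):
# Same filtering logic, but with O(1) set membership instead of a list scan
single_word_set = set(single_word_list)
def remove_single_fast(old_list, not_set):
    return [x for x in old_list if x not in not_set]
tw_words = tw_words.assign(words = tw_words.words.apply(lambda x: remove_single_fast(x, single_word_set)))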
tw_data = pd.read_pickle(path=r'../data/clean/pre_processed.pkl')
tw_words = tw_data[['ipeds_id', 'id', 'words']].copy()
del tw_data
gc.collect()
tw_words.drop(['ipeds_id'], axis=1, inplace=True)
tw_words.head(5)
gc.collect()
pkl_files = glob.glob(r'../data/twitter_data/pickle/concat_tw_*.pkl')
append_tweets = []
for file in pkl_files:
    print(str(file))
    tw_df = pd.read_pickle(file)
    tw_df = tw_df[['ipeds_id', 'id', 'date', 'tweet', 'likes_count', 'photos']]
    append_tweets.append(tw_df)
    del tw_df
    gc.collect()
diversity_df = pd.concat(append_tweets, ignore_index=True)
diversity_df = pd.merge(diversity_df, tw_words, on=['id'])
del tw_words
gc.collect()
def list_contains(cell_list, word_list):
    if [i for i in cell_list if i in word_list]:
        return True
    else:
        return False
print(list_contains(['1','2','3'], ['3']))
print(list_contains(['1','2','3'], ['4']))
# Identify diversity-related tweets
div_words = ['divers', 'multicultur']
diversity_df = diversity_df.assign(diversity_flag = diversity_df.words.apply(lambda x: list_contains(x,div_words)))
# Identify race-related tweets
race_words = ['black', 'african', 'asian', 'hispan', 'latino', 'latina']
diversity_df = diversity_df.assign(race_flag = diversity_df.words.apply(lambda x: list_contains(x,race_words)))
# Identify gender-related tweets
gender_words = ['woman', 'women','gender']
diversity_df = diversity_df.assign(gender_flag = diversity_df.words.apply(lambda x: list_contains(x,gender_words)))
diversity_df[diversity_df.gender_flag==True].tweet.head(5)
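To get a rough sense of how many tweets each flag catches, the boolean flag columns can simply be summed:
# Count of flagged tweets in each category
print(diversity_df[['diversity_flag', 'race_flag', 'gender_flag']].sum())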
gc.collect()
diversity_df.columns
len(diversity_df.index)
# Save data in chunks
diversity_df.iloc[0:1000000].to_pickle(path=r'../data/clean/diversity_full_01.pkl')
gc.collect()
diversity_df.iloc[1000000:2000000].to_pickle(path=r'../data/clean/diversity_full_02.pkl')
gc.collect()
diversity_df.iloc[2000000:3000000].to_pickle(path=r'../data/clean/diversity_full_03.pkl')
gc.collect()
diversity_df.iloc[3000000:4000000].to_pickle(path=r'../data/clean/diversity_full_04.pkl')
gc.collect()
diversity_df.iloc[4000000:5000000].to_pickle(path=r'../data/clean/diversity_full_05.pkl')
gc.collect()
diversity_df.iloc[5000000:].to_pickle(path=r'../data/clean/diversity_full_06.pkl')