Welcome! This project examines the use of Twitter by four-year colleges and universities. Specifically, I examine Tweets related to race and gender diversity. This page is focused on the analysis of the data. At the bottom of this page you will find links to tutorials to guide you through reproducing this analysis or recreating it with your own data.
The project repository can be found at https://github.com/LaurenDahlin/colleges_on_social_media.
The data in this analysis comes from all original (non-retweet) Twitter postings made by four-year colleges on their main Twitter pages and admissions Twitter pages (if they have a separate admissions page). The graph below shows the growth in tweeting by colleges from 2008 to 2017.
import pandas as pd
import numpy as np
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
# Register Plotly account credentials; presumably these are what the
# py.iplot(...) calls further down authenticate with - confirm.
# The username and API key here are placeholders: substitute your own
# before running.
plotly.tools.set_credentials_file(username='your.username', api_key='your.api.key')
import gc, glob, re
# Load every cleaned tweet pickle and stack them into one DataFrame.
#
# The paths are sorted because glob.glob returns matches in arbitrary,
# filesystem-dependent order; a fixed order keeps the row order of
# diversity_df (and therefore any seeded random sampling done on it later)
# the same from run to run.
#
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling.
# Only load pickle files that you produced yourself or otherwise trust.
pkl_files = sorted(glob.glob(r'../data/clean/diversity_full_*.pkl'))
if not pkl_files:
    # Fail with an explicit message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(
        "No files match ../data/clean/diversity_full_*.pkl - "
        "check that the cleaned data files exist."
    )

# The list keeps a reference to every frame until the concat below, so
# deleting the loop variable and forcing a garbage collection after each
# file released no memory; a plain comprehension does the same work.
append_tweets = [pd.read_pickle(pkl_path) for pkl_path in pkl_files]
diversity_df = pd.concat(append_tweets, ignore_index=True)
# Derive the calendar year from the first four characters of the date
# rendered as a string.
tweet_year = diversity_df.date.astype(str).str[:4].astype(int)
diversity_df = diversity_df.assign(year=tweet_year)

# One row per year: distinct schools (ipeds_id) and number of tweets (id).
year_summary = (
    diversity_df
    .groupby('year')
    .agg({'ipeds_id': 'nunique', 'id': 'count'})
    .reset_index(level=0)
)

# Keep only years before 2018, then give the columns display-friendly names.
year_summary = year_summary[year_summary.year < 2018]
year_summary = year_summary.rename(
    index=str,
    columns={'ipeds_id': 'School Count', 'id': 'Tweet Count'},
)
# Dual-axis line chart: school count on the left axis (y), tweet count on a
# second axis (y2) drawn on the right and overlaid on the first.
layout = go.Layout(
    title='Total Number of Colleges and Tweets by Year',
    yaxis={'title': 'Number of Colleges'},
    yaxis2={
        'title': 'Number of Tweets',
        'overlaying': 'y',
        'side': 'right',
    },
)

years = year_summary.year
school_trace = go.Scatter(
    x=years,
    y=year_summary['School Count'],
    name='School Count',
)
tweet_trace = go.Scatter(
    x=years,
    y=year_summary['Tweet Count'],
    name='Tweet Count',
    yaxis='y2',  # bind this series to the right-hand axis
)

data = [school_trace, tweet_trace]
fig = go.Figure(data=data, layout=layout)
# `py` is plotly.plotly; iplot renders the figure inline in the notebook
# under the given filename.
py.iplot(fig, filename="tweets_over_time")
We are seeing a slight decline in the number of tweets produced by colleges in recent years. This likely reflects a shift in the number of platforms being used by colleges to reach students. For example, many colleges are now also posting on Instagram.
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
# Data is too big for memory, so the word cloud is built from a random
# 10% sample of tweets. The seed makes the sample repeatable.
np.random.seed(12345)

# Tag every row with a random integer that is used to subset the data.
# np.random.randint excludes its upper bound, so (0, 100) draws from 0..99
# and `rand_int < 10` keeps exactly 10 of the 100 equally likely values.
# (The previous bound of 99 drew from 0..98, i.e. a 10/99 sample.)
diversity_df = diversity_df.assign(
    rand_int=np.random.randint(0, 100, diversity_df.shape[0])
)
# Force the words column to str so every value can be joined below.
diversity_df = diversity_df.assign(words=diversity_df.words.astype(str))

# Concatenate the sampled tweets' words into one space-separated string.
sampled_words = diversity_df.loc[diversity_df.rand_int < 10, 'words']
text = " ".join(sampled_words)
# Remove punctuation and underscores; only word characters and whitespace
# remain.
text = re.sub(r'([^\s\w]|_)+', '', text)

# Word cloud recipe adapted from:
# https://www.datacamp.com/community/tutorials/wordcloud-python
wordcloud = WordCloud(
    max_font_size=50,
    max_words=100,
    background_color="white",
).generate(text)

plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
Perhaps not surprisingly, most of these words are very generic words related to college and accolades: student, learn, congrat (congratulations), thank, campu (campus), and communiti (community). Note that the words are stemmed from earlier pre-processing of the data.
Now, let's look at the diversity-specific tweets. I have flagged tweets that mention "diversity" and "multicultural", as well as tweets with race-related ('black', 'african', 'asian', 'hispan', 'latino', 'latina') and gender-related ('woman', 'women', 'gender') words. How has the frequency of these words changed over time?
# Derive the year from the first four characters of the date string so that
# the `year` column is present on diversity_df for the grouping below.
diversity_df = diversity_df.assign(
    year=diversity_df.date.astype(str).str[:4].astype(int)
)

# Sum each keyword flag within a year to count flagged tweets per year.
flag_columns = ['diversity_flag', 'race_flag', 'gender_flag']
year_summary2 = (
    diversity_df
    .groupby('year')
    .agg({column: 'sum' for column in flag_columns})
    .reset_index(level=0)
)

# Keep only years before 2018.
year_summary2 = year_summary2[year_summary2.year < 2018]
# Bare expression: in a notebook cell this displays the summary table.
year_summary2
# One line per keyword flag, all sharing a single "Number of Tweets" axis.
flag_labels = [
    ('diversity_flag', 'Diversity Tweet Count'),
    ('race_flag', 'Race Tweet Count'),
    ('gender_flag', 'Gender Tweet Count'),
]
data = [
    go.Scatter(x=year_summary2.year, y=year_summary2[column], name=label)
    for column, label in flag_labels
]

layout = go.Layout(
    title='Total Number of Diversity-Related Tweets by Year',
    yaxis={'title': 'Number of Tweets'},
)

fig = go.Figure(data=data, layout=layout)
# `py` is plotly.plotly; iplot renders the figure inline in the notebook
# under the given filename.
py.iplot(fig, filename="diversity_tweets_over_time")
Interestingly, the number of gender-related tweets has decreased in recent years. This may reflect the fact that women are now the majority of applicants and degree-holders at U.S. universities.
Note that flagging based on keywords alone does not produce a perfect match to diversity-related tweets.
# Raise pandas' column display width to 200 characters.
pd.set_option('max_colwidth', 200)

# Print the text of the first 20 diversity-flagged tweets among the rows
# whose random tag (rand_int) is below 1.
in_sample = diversity_df.rand_int < 1
# Explicit comparison with True is kept on purpose: it is what the analysis
# has always used to build this mask.
is_diversity = diversity_df.diversity_flag == True
flagged_sample = diversity_df[in_sample & is_diversity].head(20)
for tweet_text in flagged_sample['tweet']:
    print(tweet_text)
from IPython.display import Image
from IPython.core.display import HTML
# Show the first attached photo for each of the first 50 diversity-flagged
# tweets in the rand_int < 1 subsample that have any photos.
# `Image` is IPython.display.Image here: the import just above rebinds the
# name that was earlier imported from PIL.
# NOTE(review): `display` is not imported in this file; presumably it is
# supplied by the IPython/Jupyter session - confirm.
flagged_sample = diversity_df[
    (diversity_df.diversity_flag == True) & (diversity_df.rand_int < 1)
].head(50)
for _, row in flagged_sample.iterrows():
    photos = row['photos']
    # assumes `photos` is a plain list (empty when the tweet has no images)
    # - TODO confirm against the cleaning step
    if photos != []:
        display(Image(photos[0], unconfined=True))
There are many kinds of images associated with diversity in these tweets. They range from flyers for diversity-related events to stereotypical photos of college students of different backgrounds working and studying together.