# Notebook setup: render matplotlib figures inline and use plotly in
# offline mode (no plotly account / no network round-trip).
%matplotlib inline
import gzip, pickle, collections
import umap
import pandas as pd
import plotly.offline as plotly
import plotly.graph_objs as go
import numpy as np
import matplotlib.pyplot as plt
# Registers plotly's JS with the notebook so iplot() renders inline.
plotly.init_notebook_mode()
Here is a cleaned up version of media source ideology estimation from Barbera (2015). Specifically, I have cleaned up the following:
I think we should be calling this an estimate of a media source's "content alignment" to a set of users rather than the source's estimated ideology. From Bakshy (2015): "Alignment is not a measure of media slant; rather, it captures differences in the kind of content shared among a set of partisans, which can include topic matter, framing, and slant."
Here's a breakdown of our dataset:
Users w/ est. ideology: 2,926,841
Users in sample: 30,000
Total tweets: 45,316,914
Tweets w/ URLs: 19,563,609
Retweets w/ URLs: 13,008,418
Original tweets w/ URLs: 6,555,488
Users w/ (re)tweets w/ URLs: 17,171
Things look pretty similar to last time and look like they make a good deal of sense. We could still benefit from more data. I did a clustering of sites based on the distributions of their users' ideologies, and it looks interesting and useful. The data can be downloaded here.
Let's see what the new ideology estimates look like.
# Per-user ideology point estimates (theta), indexed by user id.
est_ideo = pd.read_csv('../data/ideology-estimates-20180705.csv.gz', index_col=0)
# Z-score theta and flip the sign; presumably the flip orients the scale so
# that the left/right direction matches prior runs -- TODO confirm direction.
est_ideo['normed_theta'] = (est_ideo['theta'] - est_ideo['theta'].mean()) / est_ideo['theta'].std() * -1
display(est_ideo.describe())
_ = est_ideo['normed_theta'].plot.hist(bins=30)
Observations
Let's get the rest of the data together.
# Load the six precomputed {domain -> collection of sharer user ids} mappings:
# all / original-only / retweet-only tweets, keyed either by full subdomain
# ('sub' files) or by registered domain ('reg' files).
# NOTE(review): value type (list vs set of uids) not visible here -- confirm
# against the producer of these pickles.
datasets = {}
with open('../all_sub.pkl', 'rb') as f:
    datasets['subdomain_to_users_all'] = pickle.load(f)
with open('../orig_sub.pkl', 'rb') as f:
    datasets['subdomain_to_users_orig'] = pickle.load(f)
with open('../rt_sub.pkl', 'rb') as f:
    datasets['subdomain_to_users_rt'] = pickle.load(f)
with open('../all_reg.pkl', 'rb') as f:
    datasets['domain_to_users_all'] = pickle.load(f)
with open('../orig_reg.pkl', 'rb') as f:
    datasets['domain_to_users_orig'] = pickle.load(f)
with open('../rt_reg.pkl', 'rb') as f:
    datasets['domain_to_users_rt'] = pickle.load(f)
def create_dataframe(domain_to_users, ideology=None, min_users=30):
    """Summarize the ideology distribution of each domain's sharers.

    For every domain, looks up the normalized ideology (``normed_theta``) of
    each user who shared it, drops domains with fewer than ``min_users`` such
    users, and returns one row per surviving domain with:
      - mean_ideology: mean of the sharers' normed_theta
      - num_uniq_users: count of ALL unique sharers (including users without
        an ideology estimate)
      - 5-bin raw and density-normalized histograms of sharer ideology
      - 20-bin raw and density-normalized histograms of sharer ideology
    Histogram bin edges are shared across domains (global min/max over the
    surviving domains), so the vectors are directly comparable.

    Args:
        domain_to_users: mapping of domain -> iterable of user ids.
        ideology: DataFrame indexed by user id with a 'normed_theta' column;
            defaults to the module-level ``est_ideo``.
        min_users: minimum number of sharers with ideology estimates a domain
            needs in order to be kept (default 30, as in the original run).

    Returns:
        DataFrame indexed by domain, sorted by num_uniq_users descending.
    """
    if ideology is None:
        ideology = est_ideo
    # Hoist the column into a plain dict: O(1) lookups instead of a pandas
    # .at call plus try/except per user, which dominated runtime here.
    theta = ideology['normed_theta'].to_dict()
    domain_to_ideo = {
        domain: [theta[uid] for uid in uids if uid in theta]
        for domain, uids in domain_to_users.items()
    }
    # Columns are domains; shorter columns are NaN-padded (NaNs are ignored
    # by mean/min/max and dropped before histogramming below).
    dom_ideo = pd.DataFrame.from_dict(
        {d: v for d, v in domain_to_ideo.items() if len(v) >= min_users},
        orient='index').T
    dom_ideo_means = pd.DataFrame(dom_ideo.mean(), columns=['mean_ideology'])
    # NOTE: counts every sharer, not just those with an ideology estimate.
    domain_to_num_users = pd.DataFrame.from_dict(
        {d: len(u) for d, u in domain_to_users.items()},
        orient='index', columns=['num_uniq_users'])
    # One shared bin range so every domain's histogram is comparable.
    value_range = (dom_ideo.min().min(), dom_ideo.max().max())
    hist5 = _histogram_frame(
        dom_ideo, 5, value_range, density=False,
        columns=['users_left', 'users_center_left', 'users_center',
                 'users_center_right', 'users_right'])
    normed5 = _histogram_frame(
        dom_ideo, 5, value_range, density=True,
        columns=['normed_left', 'normed_center_left', 'normed_center',
                 'normed_center_right', 'normed_right'])
    hist20 = _histogram_frame(
        dom_ideo, 20, value_range, density=False,
        columns=[f'bin_{i+1}' for i in range(20)])
    normed20 = _histogram_frame(
        dom_ideo, 20, value_range, density=True,
        columns=[f'normed_bin_{i+1}' for i in range(20)])
    return (dom_ideo_means
            .join(domain_to_num_users)
            .join(hist5)
            .join(normed5)
            .join(hist20)
            .join(normed20)
            .sort_values(by='num_uniq_users', ascending=False))


def _histogram_frame(dom_ideo, bins, value_range, density, columns):
    """One row per domain: histogram of that domain's sharer ideologies."""
    frame = pd.DataFrame.from_dict(
        {d: np.histogram(ideos.dropna().values, bins=bins, range=value_range,
                         density=density)[0]
         for d, ideos in dom_ideo.T.iterrows()},
        orient='index')
    frame.columns = columns
    return frame
# Build the per-domain summary for each of the six datasets, then export
# each one to CSV for downstream use.
dataframes = {}
for name, dataset in datasets.items():
    dataframes[name] = create_dataframe(dataset)
for name, df in dataframes.items():
    df.to_csv(f'{name}_content_alignment_estimates_20180705.csv', index_label='domain')
    display(f'{name}: {df.shape[0]} sites')
I'm going to do all the analysis on the dataset with the most sites: tweets and retweets grouped by subdomain.
# Use the largest dataset (all tweets + retweets, grouped by subdomain)
# for the rest of the analysis.
site_ideo = dataframes['subdomain_to_users_all']
_ = site_ideo['mean_ideology'].plot.hist(bins=30)
Observations
#domain_ideo_means = dom_ideo.loc[:,dom_ideo.count().sort_values().tail(100).index].mean()
# Hand-picked well-known news media (sub)domains to highlight in the plots.
news_media_domains = [
    'english.alarabiya.net', 'aljazeera.com', 'americanthinker.com', 'bbc.com',
    'bbc.co.uk', 'bloomberg.com', 'bostonglobe.com', 'breitbart.com',
    'buzzfeed.com', 'cbc.ca', 'cbsnews.com', 'chicagotribune.com', 'cnbc.com',
    'cnn.com', 'csmonitor.com', 'dailycaller.com', 'dailykos.com',
    'dailymail.co.uk', 'economist.com', 'forbes.com', 'foreignpolicy.com',
    'fortune.com', 'foxnews.com', 'haaretz.com', 'hindustantimes.com',
    'huffingtonpost.com', 'huffpost.com', 'independent.co.uk', 'infowars.com',
    'latimes.com', 'miamiherald.com', 'motherjones.com', 'msnbc.com',
    'nationalreview.com', 'nbcnews.com', 'newsweek.com', 'newyorker.com',
    'npr.org', 'nydailynews.com', 'nypost.com', 'nytimes.com', 'pbs.org',
    'politico.com', 'propublica.org', 'realclearpolitics.com','reuters.com',
    'rollcall.com', 'rt.com', 'salon.com', 'news.sky.com', 'slate.com',
    'sputniknews.com', 'theatlantic.com', 'theguardian.com', 'thehill.com',
    'time.com', 'usatoday.com', 'vox.com', 'washingtonpost.com',
    'washingtontimes.com', 'weeklystandard.com', 'westernjournal.com', 'wsj.com',
    'zerohedge.com',
]
# High-traffic sites that are not news media, kept as a comparison group.
non_news_domains = [
    'aclu.org', 'change.org', 'cosmopolitan.com', 'facebook.com', 'google.com',
    'harvard.edu', 'hbr.org', 'mit.edu', 'patreon.com', 'politifact.com',
    'reddit.com', 'reuters.com', 'twitter.com', 'wikileaks.org', 'youtube.com',
]
domains = news_media_domains + non_news_domains
# Rank the news sites by mean audience ideology (direction of the scale
# depends on the sign flip applied to normed_theta above -- verify).
_ = site_ideo.loc[news_media_domains,'mean_ideology'].sort_values(ascending=False).plot.barh(figsize=(10, 20), legend=False)
Observations
So far, we've reduced this big set of numbers we have for each site (the ideology of each user) to a single number (the mean). I'd like to be able to better characterize the audience of these sites. To do that, I'm going to create a histogrammed version of the distribution for each site. That way, each site has an equivalent vector representing its users' ideologies regardless of the number of users who shared it.
Once I have those histograms, I'm going to give them to a dimensionality reduction algorithm so I can stick the sites on a scatter plot.
# The 5-bin density histogram gives every site a fixed-length vector
# describing its audience ideology distribution, comparable regardless of
# how many users shared the site.
dom_ideo_normed_hist = site_ideo.loc[:,['normed_left', 'normed_center_left', 'normed_center', 'normed_center_right', 'normed_right']]
# Reduce the 5-D histogram vectors to 2-D for plotting; random_state pins
# the layout so the coordinates referenced in the prose are reproducible.
ideo_hist_embedded = umap.UMAP(random_state=1, n_neighbors=50, min_dist=0.05).fit_transform(dom_ideo_normed_hist)
scatter1 = go.Scattergl(
    y=ideo_hist_embedded[:, 0],
    x=ideo_hist_embedded[:, 1],
    mode='markers',
    text=dom_ideo_normed_hist.index,  # hover shows the domain name
    hoverinfo='text',
    marker=dict(
        # Color by mean audience ideology, clamped to [-1, 1];
        # size by number of unique sharers.
        color=site_ideo['mean_ideology'].loc[dom_ideo_normed_hist.index],
        cmin=-1, cmax=1,
        size=site_ideo['num_uniq_users'].loc[dom_ideo_normed_hist.index],
        sizemode='area',
        sizemin=2,
        sizeref=30
    )
)
layout1 = go.Layout(
    title ='Domain Similarity by Distribution of Audience Ideology',
    hovermode = 'closest',
    #xaxis = dict(visible = False),
    #yaxis = dict(visible = False),
    width = 800,
    height = 800
)
plotly.iplot(go.Figure(data=[scatter1], layout=layout1))
On this plot, each point represents a media source. Points that are close together have similar audience ideology distributions, and the distance between points corresponds to that similarity. Media sources are colored by the mean audience ideology and sized by the number of unique Twitter users who shared the domain. You can zoom in by clicking and dragging a box around the area you're interested in, and zoom back out by double clicking. The actual coordinates are meaningless - they're just useful when referring to locations.
Observations
I want to know what the actual ideology distributions look like for different parts of this map, so I'll pull out three sites each from a number of different sections.
# Three representative sites from each visually distinct region of the UMAP
# scatter above; the coordinates in the comments refer to that embedding.
sampled_domains = [
    'digbysblog.blogspot.com', 'lucyforcongress.com', 'curvemag.com', # blue tip near (-6.1, -11.0)
    'patagonia.com', 'thedailydemocracy.org', 'factsdomatter.com', # isolated blue edge (-6.3, -6.9)
    'mediamatters.org', 'theroot.com', 'rawstory.com', # popular blue sites near (-0.4, -5.0)
    'politico.com', 'apnews.com', 'businessinsider.com', # popular light blue sites near (4.1, -0.7)
    'foxnews.com', 'rt.com', 'washingtonexaminer.com', # popular gray sites near (4.0, 4.3)
    '20minutos.es', 'antena3.com', 'infobae.com', # small blue-gray island near (2.6, 3.6)
    'lapatilla.com', 'larepublica.pe', 'larazon.es', # small orange-gray island near (6.2, 7.2)
    'dailycaller.com', 'ijr.com', 'navy.mil', # right before red breaks off (3.4, 6.0)
    'canoe.com', 'nraila.org', 'gop.com', # right after red breaks off (2.5, 7.8)
    'terrencekwilliams.com', 'creepingsharia.wordpress.com', 'borderwallbricks.com', # middle of red island near (0.2, 11.0)
    'ignet.gov', 'readthememo.org', 'kennedyforutah.com', # red tip near (-2.2, 11.5)
]
# One small bar chart (the 5-bin audience histogram) per sampled site,
# laid out 11 rows x 3 columns to mirror the groups above.
_ = dom_ideo_normed_hist.loc[sampled_domains,:].T.plot(subplots=True, kind='bar', layout=(11,3), figsize=(16,20), sharex=True)
Each row in this figure shows three samples from a different part of the overall map. Note that each subdomain's name appears in the legend of its own subplot rather than as a title, which is easy to miss.
Observations
Distance between audience ideologies is one way to lay out a media source graph. There's a more intuitive way that this team has used in the past to lay out a graph: if two sites are shared by the same person, they should be closer together than two sites that are not. I create that graph below.
def plot(embedded, df):
    """Scatter a 2-D embedding, one point per site in df.index.

    Points are colored by each site's mean audience ideology and sized by
    its number of unique sharers, both looked up in the module-level
    site_ideo frame (missing sites yield NaN styling via reindex).
    """
    sites = df.index
    styling = site_ideo.reindex(sites)
    trace = go.Scattergl(
        y=embedded[:, 0],
        x=embedded[:, 1],
        mode='markers',
        text=sites,
        hoverinfo='text',
        marker=dict(
            color=styling['mean_ideology'],
            cmin=-1, cmax=1,
            size=styling['num_uniq_users'],
            sizemode='area',
            sizemin=2,
            sizeref=30
        )
    )
    figure_layout = go.Layout(
        title='Site Similarity by Co-tweeting',
        hovermode='closest',
        # axis-visibility toggles intentionally left off, as in earlier plots
        height=800,
        width=800
    )
    plotly.iplot(go.Figure(data=[trace], layout=figure_layout))
with open('../sharing_patterns/cotweet_dict_5000.pkl', 'rb') as f:
cotweet_dict = pickle.load(f)
cotweets = pd.DataFrame.from_dict({d: pd.Series(e) for d, e in cotweet_dict.items()}, dtype='int64')
# Order of the two domains was not consistent, so add the two triangles together to get full counts
cotweets = cotweets + cotweets.T
cotweets.head(5)
exclude = ['twitter.com', 'cards.twitter.com', '']
wo_twitter = cotweets.drop(index=exclude, columns=exclude).fillna(0)
distances = 1 - wo_twitter / wo_twitter.sum()
distances = (distances - distances.mean()) / distances.std()
display(distances.head(5))
%time embedded = umap.UMAP(metric='precomputed', random_state=1, min_dist=0.05).fit_transform(distances)
plot(embedded, distances)
This graph can be interpreted in much the same way as Gephi graphs, except this layout algorithm is a lot more careful in setting the distances between nodes. Again, points are media sources colored by mean ideology and sized by the number of unique sharers. The actual coordinates are meaningless - they're just useful when referring to locations.
Observations