Updated Jan 11, 2019
In this notebook, we combine the data from our three samples into one large dataset and compute the mean ideology of sharers for every subdomain. The final dataset consists of 63,234,726 tweets from 46,245 accounts. Considering only subdomains shared by 30 or more accounts, we have ideology estimates for 15,459 subdomains. The estimates are available here.
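The combining step itself happens upstream of this notebook. As a rough sketch of what it could look like (the per-sample paths and pickle layout below are assumptions, not the actual pipeline), merging the per-sample subdomain-to-sharers maps might be done like this:
import collections
import pandas as pd
# Hypothetical per-sample locations; the real sample names/paths may differ.
SAMPLE_DIRS = ['data/sample_1', 'data/sample_2', 'data/sample_3']
combined = collections.defaultdict(set)
for d in SAMPLE_DIRS:
    # Each pickle is assumed to map subdomain -> collection of sharer user ids.
    for subdomain, uids in pd.read_pickle(f'{d}/subdomain_to_users.pkl').items():
        combined[subdomain] |= set(uids)
pd.to_pickle(dict(combined), 'data/all_samples_combined/subdomain_to_users.pkl')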
%matplotlib inline
import gzip, pickle, collections, itertools, random
import pandas as pd
import plotly.offline as plotly
import plotly.graph_objs as go
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbs
plotly.init_notebook_mode()
Let's grab all the user data and bin the users. To bin users, we first pick a center, then split each side of it into 10 quantile bins containing equal numbers of users.
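As a quick illustration of what pd.qcut does below (toy values, not real data): it cuts a series at its quantiles, so every bin receives the same number of observations.
toy = pd.Series([0.05, 0.1, 0.3, 0.35, 0.6, 0.9])
toy_binned, toy_edges = pd.qcut(toy, 3, retbins=True, labels=[1, 2, 3])
toy_binned.value_counts().sort_index()  # 2 values per bin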
user_ideos = pd.read_csv('data/cleaned_user_ideology_estimates_20180705.csv.gz', index_col=0)
uid_to_ideo = dict(user_ideos['normed_theta'].items())
CENTER = 0.2
users = pd.read_pickle('data/all_samples_combined/user_ids.pkl')
# Split each side of the center into 10 equal-count quantile bins:
# labels -10..-1 on the left, 1..10 on the right.
left_user_bins, left_bins = pd.qcut(
    user_ideos[user_ideos['normed_theta'] < CENTER]['normed_theta'],
    10, retbins=True, labels=list(range(-10, 0)))
right_user_bins, right_bins = pd.qcut(
    user_ideos[user_ideos['normed_theta'] > CENTER]['normed_theta'],
    10, retbins=True, labels=list(range(1, 11)))
user_bins = pd.concat([left_user_bins, right_user_bins])
user_to_bin = user_bins.to_dict()
ax = user_ideos['normed_theta'].plot.hist(bins=200, alpha=0.5, figsize=(12, 6))
ax.set_title('User ideology distribution with bins')
# The red line marks the chosen center; dotted lines mark the quantile bin edges.
ax.vlines(CENTER, 0, 35000, color='red')
ax.vlines(left_bins, 0, 35000, linestyles='dotted', alpha=0.5)
_ = ax.vlines(right_bins, 0, 35000, linestyles='dotted', alpha=0.5)
print('Bin edges:')
all_bins = list(left_bins)
all_bins.extend(list(right_bins)[1:])
all_bins
print('Users per bin:')
user_bins.value_counts().sort_index()
Now we compute the mean ideology for each subdomain.
%%time
MIN_USERS = 30
s1 = pd.read_pickle('data/all_samples_combined/subdomain_to_users.pkl')
# Keep only subdomains shared by at least MIN_USERS distinct accounts.
subdomain_to_users = {k: v for k, v in s1.items() if len(v) >= MIN_USERS}
del s1
%%time
# Pad each subdomain's list of sharer ideologies with NaNs to a common length
# so every column is the same size; float16 keeps memory usage down.
max_length = max(len(v) for v in subdomain_to_users.values())
s1 = pd.DataFrame.from_dict({k: [uid_to_ideo.get(u) for u in v] + [np.nan] * (max_length - len(v)) for k, v in subdomain_to_users.items()}, dtype='float16')
subdomain_to_num_urls = pd.Series({k: len(v) for k, v in pd.read_pickle('data/all_samples_combined/subdomain_to_urls.pkl').items()})
# Map each subdomain to the ideology-bin label of each of its sharers.
subdomain_to_user_bins = {subdomain: [user_to_bin.get(uid) for uid in uids] for subdomain, uids in subdomain_to_users.items()}
%%time
data_dict = {
    'mean_sharer_ideo': s1.mean(),
    'stddev_sharer_ideo': s1.std(),
    'num_sharers': s1.count(),
    'num_uniq_urls': subdomain_to_num_urls,
}
for i in user_bins.value_counts().sort_index().index:
    data_dict[f'num_sharers_in_ideo_bin_{i}'] = {s: len([b for b in bins if b == i]) for s, bins in subdomain_to_user_bins.items()}
df = pd.DataFrame(data_dict, index=s1.columns)
df.describe()
_ = df['mean_sharer_ideo'].plot.hist(bins=200)
news_media_domains = [
'english.alarabiya.net', 'aljazeera.com', 'americanthinker.com', 'bbc.com',
'bbc.co.uk', 'bloomberg.com', 'bostonglobe.com', 'breitbart.com',
'buzzfeed.com', 'cbc.ca', 'cbsnews.com', 'chicagotribune.com', 'cnbc.com',
'cnn.com', 'csmonitor.com', 'dailycaller.com', 'dailykos.com',
'dailymail.co.uk', 'economist.com', 'forbes.com', 'foreignpolicy.com',
'fortune.com', 'foxnews.com', 'haaretz.com', 'hindustantimes.com',
'huffingtonpost.com', 'huffpost.com', 'independent.co.uk', 'infowars.com',
'latimes.com', 'miamiherald.com', 'motherjones.com', 'msnbc.com',
'nationalreview.com', 'nbcnews.com', 'newsweek.com', 'newyorker.com',
'npr.org', 'nydailynews.com', 'nypost.com', 'nytimes.com', 'pbs.org',
'politico.com', 'propublica.org', 'realclearpolitics.com', 'reuters.com',
'rollcall.com', 'rt.com', 'salon.com', 'news.sky.com', 'slate.com',
'sputniknews.com', 'theatlantic.com', 'theguardian.com', 'thehill.com',
'time.com', 'usatoday.com', 'vox.com', 'washingtonpost.com',
'washingtontimes.com', 'weeklystandard.com', 'westernjournal.com', 'wsj.com',
'zerohedge.com',
]
non_news_domains = [
'aclu.org', 'change.org', 'cosmopolitan.com', 'facebook.com', 'google.com',
'harvard.edu', 'hbr.org', 'mit.edu', 'patreon.com', 'politifact.com',
'reddit.com', 'twitter.com', 'wikileaks.org', 'youtube.com',
]
domains = news_media_domains + non_news_domains
_ = df.loc[news_media_domains, 'mean_sharer_ideo'].sort_values(ascending=False).plot.barh(figsize=(10, 20), legend=False)
df.to_csv('media_source_ideologies_all_data.csv')
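The exported CSV can then be reloaded and queried by subdomain downstream, e.g.:
est = pd.read_csv('media_source_ideologies_all_data.csv', index_col=0)
est.loc[['motherjones.com', 'foxnews.com'], ['mean_sharer_ideo', 'num_sharers']]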