Estimate Ideologies of Media Sources using All Data

Updated Jan 11, 2019

In this notebook we combine the data from our 3 samples into one big dataset and compute the mean ideology of sharers for every subdomain. The final dataset ended up consisting of 63,234,726 tweets from 46,245 accounts. If we only consider domains that were shared by 30 or more accounts, we have ideology estimates for 15,459 domains. The estimates are exported at the end as `media_source_ideologies_all_data.csv`.

In [1]:
%matplotlib inline
# Stdlib helpers used throughout the notebook (e.g. collections for counting).
import gzip, pickle, collections, itertools, random

import pandas as pd
import plotly.offline as plotly
import plotly.graph_objs as go
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbs

# Render plotly charts inline in the notebook (offline mode).
plotly.init_notebook_mode()

Let's grab all the user data and bin the users. To bin users, first, we pick a center. Then we break up the two sides into 10 bins of equal numbers of people.

In [2]:
# Per-user ideology estimates; the CSV index is the user id.
user_ideos = pd.read_csv('data/cleaned_user_ideology_estimates_20180705.csv.gz', index_col=0)
uid_to_ideo = dict(user_ideos['normed_theta'].items())

# Boundary splitting the ideology distribution into a "left" and a "right" side.
CENTER = 0.2

# NOTE(review): `users` is not referenced again in this notebook — confirm it
# is still needed before removing the load.
users = pd.read_pickle('data/all_samples_combined/user_ids.pkl')
# Ten equal-population quantile bins on each side of CENTER
# (labels -10..-1 for the left side, 1..10 for the right).
left_user_bins, left_bins = pd.qcut(user_ideos[user_ideos['normed_theta'] < CENTER]['normed_theta'], 10, retbins=True, labels=list(range(-10, 0)))
right_user_bins, right_bins = pd.qcut(user_ideos[user_ideos['normed_theta'] > CENTER]['normed_theta'], 10, retbins=True, labels=list(range(1, 11)))
# Series.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat produces the identical combined Series.
user_bins = pd.concat([left_user_bins, right_user_bins])
user_to_bin = user_bins.to_dict()
# Histogram of the full ideology distribution with the bin edges overlaid.
ax = user_ideos['normed_theta'].plot.hist(bins=200, alpha=0.5, figsize=(12, 6))
ax.set_title('User ideology distribution with bins')
ax.vlines(CENTER, 0, 35000, color='red')
ax.vlines(left_bins, 0, 35000, linestyles='dotted', alpha=0.5)
_ = ax.vlines(right_bins, 0, 35000, linestyles='dotted', alpha=0.5)
/berkman/home/jclark/miniconda3/lib/python3.7/site-packages/numpy/lib/arraysetops.py:522: FutureWarning:

elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison

In [3]:
print('Bin edges:')
# Join the two sets of edges, dropping the duplicated shared edge at CENTER.
all_bins = list(left_bins) + list(right_bins)[1:]
all_bins
Bin edges:
Out[3]:
[-1.3816918087224237,
 -1.0153047228583716,
 -0.9145616215814966,
 -0.8224706158213905,
 -0.7342970224905225,
 -0.647712502111146,
 -0.5561871925652249,
 -0.4490320560582582,
 -0.3076680131067228,
 -0.10527498585030008,
 0.19999871184383752,
 0.38325561234199707,
 0.5941109874113187,
 0.8236586628482395,
 1.0611243204725533,
 1.2925428959837295,
 1.5183515823916451,
 1.7322828781184447,
 1.94262497187348,
 2.1812914511595096,
 3.522789097671492]
In [4]:
print('Users per bin:')
# qcut gives each side ~equal-population bins, so counts within a side match.
user_bins.value_counts().sort_index()
Users per bin:
Out[4]:
-10    115780
-9     115780
-8     115780
-7     115780
-6     115780
-5     115779
-4     115780
-3     115780
-2     115780
-1     115780
 1      54187
 2      54187
 3      54187
 4      54187
 5      54187
 6      54187
 7      54187
 8      54187
 9      54187
 10     54187
Name: normed_theta, dtype: int64

Now we compute the mean ideology for each subdomain.

In [5]:
%%time

MIN_USERS = 30

s1 = pd.read_pickle('data/all_samples_combined/subdomain_to_users.pkl')
subdomain_to_users = {k: v for k, v in s1.items() if len(v) >= MIN_USERS}
del s1
CPU times: user 5.4 s, sys: 345 ms, total: 5.75 s
Wall time: 5.02 s
In [6]:
%%time

max_length = max(len(v) for k,v in subdomain_to_users.items())
s1 = pd.DataFrame.from_dict({k:[uid_to_ideo.get(u, None) for u in v] + ([np.nan] * (max_length - len(v))) for k,v in subdomain_to_users.items()}, dtype='float16')

subdomain_to_num_urls = pd.Series({k: len(v) for k,v in pd.read_pickle('data/all_samples_combined/subdomain_to_urls.pkl').items()})
subdomain_to_user_bins = {subdomain: [user_to_bin.get(uid, None) for uid in users] for subdomain, users in subdomain_to_users.items()}
CPU times: user 4min 56s, sys: 6.05 s, total: 5min 2s
Wall time: 5min 2s
In [7]:
%%time

data_dict = {
    'mean_sharer_ideo': s1.mean(), 
    'stddev_sharer_ideo': s1.std(),
    'num_sharers': s1.count(),
    'num_uniq_urls': subdomain_to_num_urls,
}

for i in user_bins.value_counts().sort_index().index:
    data_dict[f'num_sharers_in_ideo_bin_{i}'] = {s: len([b for b in subdomain_to_user_bins[s] if b == i]) for s in subdomain_to_user_bins.keys()}
CPU times: user 2min 1s, sys: 3.8 s, total: 2min 5s
Wall time: 1min 56s
In [8]:
# Assemble the summary table, one row per subdomain (the columns of s1).
df = pd.DataFrame(data_dict, index=s1.columns)
df.describe()
Out[8]:
mean_sharer_ideo stddev_sharer_ideo num_sharers num_uniq_urls num_sharers_in_ideo_bin_-10 num_sharers_in_ideo_bin_-9 num_sharers_in_ideo_bin_-8 num_sharers_in_ideo_bin_-7 num_sharers_in_ideo_bin_-6 num_sharers_in_ideo_bin_-5 ... num_sharers_in_ideo_bin_1 num_sharers_in_ideo_bin_2 num_sharers_in_ideo_bin_3 num_sharers_in_ideo_bin_4 num_sharers_in_ideo_bin_5 num_sharers_in_ideo_bin_6 num_sharers_in_ideo_bin_7 num_sharers_in_ideo_bin_8 num_sharers_in_ideo_bin_9 num_sharers_in_ideo_bin_10
count 15459.000000 15459.000000 15459.000000 1.545900e+04 15459.000000 15459.000000 15459.000000 15459.000000 15459.000000 15459.000000 ... 15459.000000 15459.000000 15459.000000 15459.000000 15459.00000 15459.000000 15459.000000 15459.000000 15459.000000 15459.000000
mean 0.249023 0.856445 253.531212 1.345549e+03 20.773918 21.253768 19.555534 18.540785 16.489100 14.479915 ... 5.481791 5.685167 5.297885 5.324212 6.09826 6.846174 8.138431 12.321237 16.474740 19.821204
std 0.814453 0.289062 839.387820 7.720443e+04 82.099345 80.976349 73.656549 63.371528 56.732253 49.309763 ... 22.114663 21.341445 21.846166 20.714103 22.73093 25.766674 28.966133 40.303653 51.225334 61.058262
min -0.806152 0.196899 13.000000 1.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000
25% -0.402588 0.612305 39.000000 2.700000e+01 1.000000 1.000000 1.000000 2.000000 1.000000 1.000000 ... 1.000000 1.000000 0.000000 0.000000 0.00000 0.000000 0.000000 1.000000 0.000000 1.000000
50% 0.012093 0.882324 67.000000 7.100000e+01 4.000000 4.000000 4.000000 5.000000 4.000000 4.000000 ... 2.000000 2.000000 1.000000 2.000000 2.00000 2.000000 2.000000 3.000000 3.000000 3.000000
75% 0.674561 1.103516 165.000000 2.225000e+02 11.000000 12.000000 11.000000 12.000000 10.000000 9.000000 ... 4.000000 4.000000 4.000000 4.000000 4.00000 4.000000 5.000000 8.000000 10.000000 12.000000
max 2.164062 1.540039 29515.000000 9.486346e+06 2464.000000 2368.000000 2234.000000 1906.000000 1756.000000 1619.000000 ... 1011.000000 949.000000 1006.000000 956.000000 917.00000 1023.000000 986.000000 1078.000000 1094.000000 1244.000000

8 rows × 24 columns

In [9]:
_ = df['mean_sharer_ideo'].plot.hist(bins=200)
In [10]:
# Hand-curated set of mainstream and partisan news outlets to highlight.
news_media_domains = [
    'english.alarabiya.net', 'aljazeera.com', 'americanthinker.com', 'bbc.com',
    'bbc.co.uk', 'bloomberg.com', 'bostonglobe.com', 'breitbart.com',
    'buzzfeed.com', 'cbc.ca', 'cbsnews.com', 'chicagotribune.com', 'cnbc.com',
    'cnn.com', 'csmonitor.com', 'dailycaller.com', 'dailykos.com',
    'dailymail.co.uk', 'economist.com', 'forbes.com', 'foreignpolicy.com',
    'fortune.com', 'foxnews.com', 'haaretz.com', 'hindustantimes.com',
    'huffingtonpost.com', 'huffpost.com', 'independent.co.uk', 'infowars.com',
    'latimes.com', 'miamiherald.com', 'motherjones.com', 'msnbc.com',
    'nationalreview.com', 'nbcnews.com', 'newsweek.com', 'newyorker.com',
    'npr.org', 'nydailynews.com', 'nypost.com', 'nytimes.com', 'pbs.org',
    'politico.com', 'propublica.org', 'realclearpolitics.com', 'reuters.com',
    'rollcall.com', 'rt.com', 'salon.com', 'news.sky.com', 'slate.com',
    'sputniknews.com', 'theatlantic.com', 'theguardian.com', 'thehill.com',
    'time.com', 'usatoday.com', 'vox.com', 'washingtonpost.com',
    'washingtontimes.com', 'weeklystandard.com', 'westernjournal.com', 'wsj.com',
    'zerohedge.com',
]
# Popular non-news domains kept for comparison.
non_news_domains = [
    'aclu.org', 'change.org', 'cosmopolitan.com', 'facebook.com', 'google.com',
    'harvard.edu', 'hbr.org', 'mit.edu', 'patreon.com', 'politifact.com',
    'reddit.com', 'reuters.com', 'twitter.com', 'wikileaks.org', 'youtube.com',
]
domains = news_media_domains + non_news_domains
# Horizontal bar chart of mean sharer ideology for the news outlets only.
outlet_ideos = df.loc[news_media_domains, 'mean_sharer_ideo']
_ = outlet_ideos.sort_values(ascending=False).plot.barh(figsize=(10, 20), legend=False)
In [11]:
df.to_csv('media_source_ideologies_all_data.csv')