Updated Feb 11, 2019
We've aggregated in a bunch of ways in the past: account-level, story-level, domain-level. What happens if we don't do any of that? Let's just look at the ratio of shares from the left and the right. We're still using the Barbera account ideology estimates and the center we estimated (0.395), but every share comes from either the left or the right of that center. First we'll just look at raw shares, and then we'll look at the percentage of total shares from each side, which will give the left and the right equal weight.
I'm only going to look at shares from accounts that share the given domain 5000 times or fewer.
%matplotlib inline
import collections
import ujson, tqdm
import pandas as pd
import numpy as np
# Estimated political center of the Barbera normed-theta scale; accounts with
# ideology below this are "left", at-or-above are "right".
CENTER = 0.395

# Per-account ideology estimates, indexed by account id.
all_acct_ideos = pd.read_csv('data/cleaned_user_ideology_estimates_20180705.csv.gz', index_col=0)
acct_ids_in_sample = pd.read_pickle('data/all_samples_combined/user_ids.pkl')

# Re-center so 0 is the estimated political center.
all_acct_ideos['ideo'] = all_acct_ideos['normed_theta'] - CENTER

# Restrict to accounts in our sample; drop accounts with no ideology estimate.
acct_ideos = all_acct_ideos.reindex(acct_ids_in_sample).dropna()

# Vectorized pole labeling — identical to the row-wise apply
# (ideo < 0 -> 'left', else 'right') but runs in a single numpy pass.
acct_ideos['pole'] = np.where(acct_ideos['ideo'] < 0, 'left', 'right')

acct_to_ideo = acct_ideos['ideo'].to_dict()
acct_to_pole = acct_ideos['pole'].to_dict()

_ = acct_ideos['ideo'].plot.hist(bins=300)
_ = acct_ideos['pole'].value_counts().plot.bar()
Let's look at the ratio of left/right shares, excluding — for each domain — the shares from any account that shared that particular domain more than 5000 times.
# subdomain -> Counter({'left': n, 'right': m}) of kept shares.
subdomain_to_pole_shares = collections.defaultdict(collections.Counter)

# subdomain -> account ids of the accounts that shared it (one entry per
# share, presumably -- verify against the notebook that built the pickle).
subdomain_to_acct_shares = pd.read_pickle('data/all_samples_combined/subdomain_to_user_shares.pkl')

# subdomain -> Counter of shares per account, kept for later inspection.
subdomain_to_num_acct_shares = {}

# Drop all shares of a (subdomain, account) pair when that account shared
# that particular subdomain more than this many times.
MAX_SHARES_FROM_SINGLE_ACCOUNT = 5000

for subdomain, acct_shares in tqdm.tqdm(subdomain_to_acct_shares.items()):
    # Hoisted: one Counter build + one dict store per subdomain, instead of
    # re-looking up subdomain_to_num_acct_shares[subdomain] on every share.
    acct_counts = collections.Counter(acct_shares)
    subdomain_to_num_acct_shares[subdomain] = acct_counts
    for acct in acct_shares:
        if acct_counts[acct] > MAX_SHARES_FROM_SINGLE_ACCOUNT:
            continue
        # Accounts without an ideology estimate are skipped; .get replaces
        # the per-item try/except KeyError (pole is never None otherwise).
        pole = acct_to_pole.get(acct)
        if pole is not None:
            # Index via the defaultdict only on increment, so subdomains with
            # zero kept shares do not get an empty entry (matters for the
            # DataFrame built from this dict below).
            subdomain_to_pole_shares[subdomain][pole] += 1
# One row per subdomain, one column per pole; a subdomain with no shares
# from one pole gets 0 there rather than NaN.
raw = (pd.DataFrame
       .from_dict(subdomain_to_pole_shares, orient='index')
       .fillna(0)
       .rename(columns={'left': 'left_shares', 'right': 'right_shares'}))
raw['total_shares'] = raw[['left_shares', 'right_shares']].sum(axis=1)
raw['pct_of_shares_from_left'] = raw['left_shares'] / raw['total_shares']
raw.sample(5)
_ = raw['pct_of_shares_from_left'].plot.hist(bins=200)
# Hand-curated news outlets to highlight in the ranking plots (domains as
# they appear in the share data; order preserved from the original list).
news_media_domains = [
    'english.alarabiya.net', 'aljazeera.com', 'americanthinker.com',
    'bbc.com', 'bbc.co.uk', 'bloomberg.com', 'bostonglobe.com',
    'breitbart.com', 'buzzfeed.com', 'cbc.ca', 'cbsnews.com',
    'chicagotribune.com', 'cnbc.com', 'cnn.com', 'csmonitor.com',
    'dailycaller.com', 'dailykos.com', 'dailymail.co.uk', 'economist.com',
    'forbes.com', 'foreignpolicy.com', 'fortune.com', 'insider.foxnews.com',
    'nation.foxnews.com', 'foxnews.com', 'haaretz.com', 'hindustantimes.com',
    'huffingtonpost.com', 'huffpost.com', 'independent.co.uk',
    'infowars.com', 'latimes.com', 'miamiherald.com', 'motherjones.com',
    'msnbc.com', 'nationalreview.com', 'nbcnews.com', 'newsweek.com',
    'newyorker.com', 'npr.org', 'nydailynews.com', 'nypost.com',
    'nytimes.com', 'pbs.org', 'politico.com', 'propublica.org',
    'realclearpolitics.com', 'reuters.com', 'rollcall.com', 'rt.com',
    'salon.com', 'news.sky.com', 'slate.com', 'sputniknews.com',
    'theatlantic.com', 'theguardian.com', 'thehill.com', 'time.com',
    'usatoday.com', 'vox.com', 'washingtonpost.com', 'washingtontimes.com',
    'weeklystandard.com', 'westernjournal.com', 'wsj.com', 'zerohedge.com',
]
# Frequently shared domains that are not news outlets (platforms, orgs,
# universities), kept separate from the news list.
non_news_domains = [
    'aclu.org',
    'change.org',
    'cosmopolitan.com',
    'facebook.com',
    'google.com',
    'harvard.edu',
    'hbr.org',
    'mit.edu',
    'patreon.com',
    'politifact.com',
    'reddit.com',
    'reuters.com',  # also present in news_media_domains
    'twitter.com',
    'wikileaks.org',
    'youtube.com',
]
# Shift so the chart is centered at 0: 0.5 - pct_of_shares_from_left is
# negative for domains whose shares come mostly from the left, positive for
# mostly-right domains.
news_left_pcts = raw.loc[news_media_domains, 'pct_of_shares_from_left']
_ = (0.5 - news_left_pcts).sort_values(ascending=False).plot.barh(figsize=(10, 20), legend=False)
Observations
Let's look at the same ratio of left/right shares, but divide each side by the total shares on that side.
# Normalize each side by its own total volume so the (larger) side doesn't
# dominate: each domain's count becomes its fraction of that side's shares.
num_left_shares = raw['left_shares'].sum()
num_right_shares = raw['right_shares'].sum()
num_total_shares = num_left_shares + num_right_shares

raw['pct_of_left_shares'] = raw['left_shares'] / num_left_shares
raw['pct_of_right_shares'] = raw['right_shares'] / num_right_shares

# Of the volume-normalized attention to each domain, the fraction coming
# from the left.
left_norm = raw['pct_of_left_shares']
right_norm = raw['pct_of_right_shares']
raw['pct_of_shares_from_left_pct'] = left_norm / (left_norm + right_norm)

raw.sample(5)

# Same centered bar chart as before, using the normalized ratio.
_ = (0.5 - raw.loc[news_media_domains, 'pct_of_shares_from_left_pct']).sort_values(ascending=False).plot.barh(figsize=(10, 20), legend=False)
Observations
# Spot-check individual domains' counts and ratios.
# NOTE(review): presumably singled out because their rankings looked
# surprising above -- confirm against the Observations cells.
raw.loc['news.sky.com']
raw.loc['rt.com']
# Persist per-subdomain share counts and both left-share ratios for reuse.
raw.to_csv('data/all_samples_combined/subdomain_ideo_est_just_shares.csv')