One of the problems with the user ideology estimates is that once we come up with estimates for media sources from that, it's hard to know where to break the distribution up into quintiles. One thing Rob wanted to do to remedy that is look at users that follow some ratio of Rs and Ds to create a two-pole system. For example, all users that follow >=75% Democrats make up our blue pole. For any given media source, we look at what percentage of each pole shared it.
This doesn't seem great because it throws away a lot of data about people in the middle. If the problem is where to draw lines in the distribution, going back this far and discarding data about the center doesn't seem like a good approach. We could have media sources that are shared entirely by people in the center-right (like the Spanish-language news sites). Those folks can be center-right in two ways: either they exclusively follow the few center-right politicians that exist, or they follow some mix of left and right politicians in roughly a 1:2 ratio. If they exclusively follow center-right politicians, they are lumped into the R camp along with the alt-right and everyone else. If they follow a mix, they might fall below our threshold and get excluded entirely.
# Render matplotlib figures inline in the notebook (IPython magic).
%matplotlib inline
import gzip, pickle, collections, itertools, random, os, glob, re
import pandas as pd
import plotly.offline as plotly
import plotly.graph_objs as go
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbs
import scipy.io, tqdm
# Enable plotly's offline (no-server) rendering inside the notebook.
plotly.init_notebook_mode()
This section just gets all the data into an easy-to-use format. Skip down to the next section for results.
data_filename = 'data/follower_to_elites_dict.pkl'
if os.path.isfile(data_filename):
    # A cached mapping already exists; load it instead of rebuilding.
    follower_to_elites = pd.read_pickle(data_filename)
else:
    # Rebuild follower id -> [elite screen names] from the raw follower
    # list files (one file per elite account, one follower id per line).
    follower_to_elites = collections.defaultdict(list)
    list_files = glob.glob('data/follower_lists_20180707/*.txt')
    for path in tqdm.tqdm(list_files, smoothing=0):
        # The elite's screen name is the file's base name.
        elite_acct = re.match(r'data/follower_lists_20180707/(.*)\.txt', path).groups()[0]
        with open(path) as fh:
            for raw in fh:
                follower_to_elites[int(raw.strip())].append(elite_acct)
    pd.to_pickle(follower_to_elites, data_filename)
# Drop accounts that follow fewer than three elites.  Snapshot the keys
# first so the dict can be mutated while we walk it.
followers = list(follower_to_elites)
print(len(followers))
for acct in tqdm.tqdm(followers):
    elites_followed = follower_to_elites[acct]
    if len(elites_followed) < 3:
        del follower_to_elites[acct]
print(len(follower_to_elites))
# Elite screen names, recovered from the follower-list file names.
name_pattern = re.compile(r'data/follower_lists_20180707/(.*)\.txt')
elite_accts = [
    name_pattern.match(path).groups()[0]
    for path in tqdm.tqdm(glob.glob('data/follower_lists_20180707/*.txt'), smoothing=0)
]
data_filename = 'data/elite_to_followers_dict.pkl'
if os.path.isfile(data_filename):
    # Load the cached inverse mapping if it has already been built.
    elite_to_followers = pd.read_pickle(data_filename)
else:
    # Build elite screen name -> [follower ids], the inverse of
    # follower_to_elites, straight from the raw list files.
    elite_to_followers = collections.defaultdict(list)
    for path in tqdm.tqdm(glob.glob('data/follower_lists_20180707/*.txt'), smoothing=0):
        acct_name = re.match(r'data/follower_lists_20180707/(.*)\.txt', path).groups()[0]
        with open(path) as fh:
            elite_to_followers[acct_name].extend(int(raw.strip()) for raw in fh)
    pd.to_pickle(elite_to_followers, data_filename)
pol_data = pd.read_csv('data/politician_data-20180705.csv', index_col=1)
pol_data.sample(5)
# Re-key the politician table by screen name for per-account lookups.
pol_data_by_sn = pol_data.set_index('screen_name')
known_sns = pol_data_by_sn.index
# Map each elite account to its party, skipping accounts with no
# politician record.
elite_acct_to_party = {}
for acct in elite_accts:
    if acct in known_sns:
        elite_acct_to_party[acct] = pol_data_by_sn.at[acct, 'party']
# For every follower, count how many of the elites they follow belong to
# each party (None for elites with no party record).
follower_to_party_counts = {
    acct: collections.Counter(elite_acct_to_party.get(e) for e in elites)
    for acct, elites in tqdm.tqdm(follower_to_elites.items())
}
pd.to_pickle(follower_to_party_counts, 'data/follower_to_elite_party_counts.pkl')
follower_to_party_counts = pd.read_pickle('data/follower_to_elite_party_counts.pkl')
# Discover the party labels actually present in the data (expected:
# 'Democrat', 'Republican', 'Independent', and None for elites without a
# politician record).  The original also pre-assigned a hard-coded list to
# `parties` that was immediately overwritten — dead code, removed.
parties = set()
for party_counts in follower_to_party_counts.values():
    parties.update(party_counts.keys())
parties
# Peek at the first ten follower ids (insertion order of the dict).
top = list(itertools.islice(follower_to_party_counts, 10))
# One row per follower, one column per party label; missing counts -> NaN.
df = pd.DataFrame.from_dict(follower_to_party_counts, orient='index')
df
df.to_pickle('data/follower_to_elite_party_counts_df.pkl')
Here I'm going to look at the party loyalists and how those loyalists can be used to compute ideologies for sites.
Sample of account data
accts = pd.read_pickle('data/follower_to_elite_party_counts_df.pkl').fillna(0)
accts.sample(5)
# Keep only the two major parties for the pole computation.
accts = accts.drop([None, 'Independent'], axis=1)
# Row-wise total of D + R elites followed.  (Replaces accts.T.sum(),
# which transposed the entire frame just to sum rows.)
accts['Sum'] = accts.sum(axis=1)
# Require at least MIN_FOLLOWED major-party elites per account so the
# D-share isn't computed from a tiny denominator.
MIN_FOLLOWED = 3
accts = accts[accts['Sum'] >= MIN_FOLLOWED]
# Fraction of followed major-party elites that are Democrats, in [0, 1].
accts['D/Sum'] = accts['Democrat'] / accts['Sum']
accts.sample(10)
_ = accts['D/Sum'].plot.hist(bins=30)
# An account is an R "loyalist" when at most 20% of the major-party
# elites it follows are Democrats, and a D loyalist when at least 80% are.
# Accounts in between get no Party_Followed label (NaN).
R_LOYALIST_THRESHOLD = 0.2
D_LOYALIST_THRESHOLD = 0.8
d_share = accts['D/Sum']
accts['R_Follower'] = d_share <= R_LOYALIST_THRESHOLD
accts['D_Follower'] = d_share >= D_LOYALIST_THRESHOLD
accts.loc[accts['R_Follower'], 'Party_Followed'] = 'R'
accts.loc[accts['D_Follower'], 'Party_Followed'] = 'D'
accts.sample(10)
accts['Party_Followed'].value_counts()
Observations
subdomain_to_users = pd.read_pickle('data/all_samples_combined/subdomain_to_users.pkl')
if os.path.isfile('data/subdomain_to_party_followers.pkl'):
    subdomain_to_party_followers = pd.read_pickle('data/subdomain_to_party_followers.pkl')
else:
    # For each subdomain, count sharers by loyalist pole ('R', 'D', or NaN
    # for non-loyalists), considering only sharers present in accts.
    # BUG FIX: the column created earlier is 'Party_Followed'; the original
    # read 'Party', which does not exist and raises a KeyError on rebuild.
    subdomain_to_party_followers = {
        s: collections.Counter([accts.at[sharer, 'Party_Followed']
                                for sharer in sharers if sharer in accts.index])
        for s, sharers in subdomain_to_users.items()}
    pd.to_pickle(subdomain_to_party_followers, 'data/subdomain_to_party_followers.pkl')
MIN_USERS = 5
# Keep subdomains shared by more than MIN_USERS tracked users.  Each value
# is a Counter of party label -> sharer count, so the user total is
# sum(u.values()).  BUG FIX: the original tested len(u), which is the
# number of distinct party labels (at most ~3) and could never exceed 5,
# leaving subdomains_with_data empty.
subdomains_with_data = [s for s, u in subdomain_to_party_followers.items()
                        if sum(u.values()) > MIN_USERS]
len(subdomains_with_data)
# One row per subdomain with raw R and D loyalist share counts (Counter
# returns 0 for a missing party).
site_ideos = pd.DataFrame.from_dict({k:
    {p: subdomain_to_party_followers[k][p] for p in ['R', 'D']}
    for k in subdomains_with_data}, orient='index')
site_ideos.sample(5)
# Normalize each pole's raw share counts by the size of that pole so the
# larger pole doesn't dominate; the Series index ('R', 'D') aligns with
# the frame's columns during division.
pole_sizes = accts['Party_Followed'].value_counts()
site_ideos = site_ideos.join(site_ideos.div(pole_sizes), rsuffix='_pct')
# Ideology score in [0, 1]: share of (normalized) sharing that came from
# the blue pole.  0 = all-R, 1 = all-D.
site_ideos['R+D'] = site_ideos['R'] + site_ideos['D']
site_ideos['R_pct+D_pct'] = site_ideos['R_pct'] + site_ideos['D_pct']
site_ideos['D/R+D'] = site_ideos['D'] / site_ideos['R+D']
site_ideos['D_pct/R_pct+D_pct'] = site_ideos['D_pct'] / site_ideos['R_pct+D_pct']
_ = site_ideos['D/R+D'].plot.hist(bins=300)
_ = site_ideos['D_pct/R_pct+D_pct'].plot.hist(bins=300)
# Bucket the [0, 1] ideology scores into five equal-width ideology groups.
BUCKET_BREAKS = [0, 0.2, 0.4, 0.6, 0.8, 1]
BUCKET_LABELS = ['right', 'center-right', 'center', 'center-left', 'left']
# BUG FIX: the original passed an undefined name `breaks` to pd.cut
# (NameError); the bin edges are BUCKET_BREAKS.
site_ideos['ideo_group_by_count'] = pd.cut(site_ideos['D/R+D'], BUCKET_BREAKS, labels=BUCKET_LABELS)
site_ideos['ideo_group_by_pct'] = pd.cut(site_ideos['D_pct/R_pct+D_pct'], BUCKET_BREAKS, labels=BUCKET_LABELS)
site_ideos.sample(5)
# Plot bucket sizes left-to-right from 'left' to 'right'; one color per bar.
label_order = BUCKET_LABELS[::-1]
bar_colors = ['#0d3b6e', '#869db6', '#2a7526', '#d8919e', '#b1243e']
ax = (site_ideos['ideo_group_by_count'].value_counts()
      .reindex(label_order)
      .plot.bar(color=bar_colors))
_ = ax.set_title('Site Ideology by Party Follower Shares (Raw Counts)')
ax = (site_ideos['ideo_group_by_pct'].value_counts()
      .reindex(label_order)
      .plot.bar(color=bar_colors))
_ = ax.set_title('Site Ideology by Party Follower Shares (%)')
# Top ten most-shared subdomains inside each ideology bucket.
site_ideos.groupby('ideo_group_by_count').apply(lambda g: g.sort_values('R+D', ascending=False).head(10))
site_ideos.groupby('ideo_group_by_pct').apply(lambda g: g.sort_values('R+D', ascending=False).head(10))
site_ideos.to_csv('data/media_source_ideo_with_party_pole_buckets.csv')