Updated Jan 16, 2019
We have ideology estimates for a large number of Twitter accounts now. The problem is that 0.0 represents the mean of the account ideologies, not anything about the center of the political spectrum. It would be nice if 0.0 represented something about the world rather than the mean of the accounts we happen to have. The way we're going to get an estimate of that is by looking at how accounts describe themselves, and comparing that to their estimated ideologies to find the center. We're going to follow Barberá's "Birds of the Same Feather Tweet Together" (2015).
The general outline is:
# Notebook setup: inline matplotlib figures and offline plotly rendering.
%matplotlib inline
import gzip, pickle, collections, itertools, random, json, re
import pandas as pd
import plotly.offline as plotly
import plotly.graph_objs as go
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbs
from scipy import stats, optimize
plotly.init_notebook_mode()  # enable offline plotly output in this notebook
This is the command I used on my home machine to pull out user descriptions that included political keywords.
# Keywords whose presence in a profile description suggests a self-declared
# political leaning; also used below to map descriptions to groups.
filter_terms = ['conservative', 'gop', 'republican', 'liberal', 'progressive', 'democrat', 'moderate', 'independent']
# Case-insensitive jq regex test over the description. (An earlier
# case-sensitive contains()-based filter was dead code: it was assigned and
# then immediately overwritten by this version, so it has been removed.)
jq_filter = r'test("({})"; "i")'.format('|'.join(filter_terms))
print(f"jq -c 'select(.description | {jq_filter}) | [.id_str, .description]' data/users_with_ideo_estimates.ndjson | gzip > user_descs_with_political_keywords.ndjson.gz")
# Load the (account id, description) pairs produced by the jq command above
# into a one-column DataFrame indexed by integer account id.
with gzip.open('data/user_descs_with_political_keywords.ndjson.gz', 'r') as f:
    parsed = (json.loads(line) for line in f)
    desc_by_id = {int(acct_id): desc.strip() for acct_id, desc in parsed}
acct_descs = pd.DataFrame.from_dict(desc_by_id, orient='index', columns=['description'])
Here we assign an ideology to a user based on the first use of an ideological keyword in their description.
# Map each self-description keyword to a coarse ideological group.
ideo_group_terms = {
    'conservative': 'right',
    'gop': 'right',
    'republican': 'right',
    'liberal': 'left',
    'progressive': 'left',
    'democrat': 'left',
    'moderate': 'center',
    'independent': 'center'
}
# Case-insensitive alternation over the keywords; group 1 captures the keyword.
pattern = re.compile(f"({'|'.join(ideo_group_terms.keys())})", re.I)
def desc_to_ideo_group(desc):
    """Return the ideological group ('left', 'right', or 'center') implied by
    the first ideological keyword appearing in ``desc``.

    Raises:
        ValueError: if ``desc`` contains none of the keywords.
    """
    match = pattern.search(desc)
    if match is None:
        # Bug fix: with no match, match.group(...) used to raise
        # AttributeError (not KeyError), so the intended ValueError was
        # unreachable. KeyError itself can never fire because the pattern is
        # built from the dict's own keys.
        raise ValueError(f'"{desc}" did not contain a keyword')
    return ideo_group_terms[match.group(1).lower()]
# Label each account with the group implied by the first keyword in its bio.
acct_descs['ideo_group'] = acct_descs['description'].apply(desc_to_ideo_group)
# Per-account ideal-point estimates, indexed by account id; the inner join
# keeps only accounts that have both an estimate and a keyword-bearing bio.
acct_ideos = pd.read_csv('data/cleaned_user_ideology_estimates_20180705.csv.gz', index_col=0)
accts = acct_ideos.join(acct_descs, how='inner')
accts.sample(5)  # spot-check a few joined rows
accts['ideo_group'].value_counts()  # how many accounts fall in each group
# Kernel-density curves of estimated ideology for each self-described group,
# drawn on one shared axis: center=green, right=red, left=blue.
ideo_range = (accts['normed_theta'].min(), accts['normed_theta'].max())
ax = None
for grp, col in [('center', 'g'), ('right', 'r'), ('left', 'b')]:
    thetas = accts.loc[accts['ideo_group'] == grp, 'normed_theta']
    if ax is None:
        # First curve creates the axes and fixes the x-limits.
        ax = thetas.plot.density(alpha=0.8, color=col, xlim=ideo_range)
    else:
        ax = thetas.plot.density(alpha=0.8, ax=ax, color=col)
It looks like they actually cross, which is good. Let's actually get an estimate of where they cross relative to our ideology estimates.
# Binned comparison of the left and right ideology distributions on a shared
# grid of num_bins equal-width bins spanning ideo_range.
num_bins = 40
l_hist, l_edges = np.histogram(accts.loc[accts['ideo_group'] == 'left','normed_theta'], bins=num_bins, density=True, range=ideo_range)
# r_edges is identical to l_edges (same bins and range) and goes unused below.
r_hist, r_edges = np.histogram(accts.loc[accts['ideo_group'] == 'right','normed_theta'], bins=num_bins, density=True, range=ideo_range)
# bin_ideo_center = midpoint of each bin (pairwise rolling mean of the edges),
# rounded for axis labels. NOTE(review): as the only Series in the dict, its
# index (1..num_bins) appears to become the frame index, with the plain
# arrays aligned positionally — confirm before restructuring.
hists = pd.DataFrame({'left': l_hist, 'right': r_hist, 'bin_ideo_center': pd.Series(l_edges).rolling(2).mean().round(3)[1:]})
_ = hists.plot.bar(x='bin_ideo_center', figsize=(16, 6))
# Smooth each group's distribution with a Gaussian KDE and plot both curves
# so we can locate where the left (blue) and right (red) densities cross.
l_kernel = stats.gaussian_kde(accts.loc[accts['ideo_group'] == 'left', 'normed_theta'])
r_kernel = stats.gaussian_kde(accts.loc[accts['ideo_group'] == 'right', 'normed_theta'])
fig = plt.figure()
ax = fig.add_subplot(111)
x_eval = np.linspace(ideo_range[0], ideo_range[1], num=500)
ax.plot(x_eval, l_kernel(x_eval), 'b-')
_ = ax.plot(x_eval, r_kernel(x_eval), 'r-')
# Diagnostic: location of each KDE's lowest density within [-1, 2].
# NOTE(review): minimize_scalar normally returns a scalar .x; indexing .x[0]
# presumably works here only because gaussian_kde.evaluate returns a
# length-1 array that propagates through the optimizer — confirm against the
# installed scipy version before upgrading.
print('Left minimum:', optimize.minimize_scalar(l_kernel.evaluate, bounds=(-1, 2), method='bounded').x[0])
print('Right minimum:', optimize.minimize_scalar(r_kernel.evaluate, bounds=(-1, 2), method='bounded').x[0])
# "Center" = the point in [-1, 2] minimizing |left density - right density|,
# i.e. where the two curves cross (assuming they do cross in that interval).
center = optimize.minimize_scalar(lambda f: np.abs(l_kernel.evaluate(f) - r_kernel.evaluate(f)), bounds=(-1, 2), method='bounded').x[0]
print(f'Where left and right cross: {center}')
The center is about 0.395
Let's look at how that center relates to our media source ideology scores. They're on the same scale, so the center might be meaningful if it's simply lifted and applied to the media.
# Media-outlet ideology table, indexed by domain. Add each outlet's absolute
# distance from the estimated political center found above.
df = pd.read_csv('media_source_ideologies_all_data.csv', index_col=0)
df['diff_from_center'] = (df['mean_sharer_ideo'] - center).abs()
interesting_cols = ['mean_sharer_ideo', 'stddev_sharer_ideo', 'num_sharers',
                    'num_uniq_urls', 'diff_from_center']
# Hand-picked list of prominent news domains (roughly alphabetical) to
# compare against the estimated center below.
interesting_domains = ['english.alarabiya.net', 'aljazeera.com', 'americanthinker.com', 'bbc.com',
                       'bbc.co.uk', 'bloomberg.com', 'bostonglobe.com', 'breitbart.com',
                       'buzzfeed.com', 'cbc.ca', 'cbsnews.com', 'chicagotribune.com', 'cnbc.com',
                       'cnn.com', 'csmonitor.com', 'dailycaller.com', 'dailykos.com',
                       'dailymail.co.uk', 'economist.com', 'forbes.com', 'foreignpolicy.com',
                       'fortune.com', 'foxnews.com', 'haaretz.com', 'hindustantimes.com',
                       'huffingtonpost.com', 'huffpost.com', 'independent.co.uk', 'infowars.com',
                       'latimes.com', 'miamiherald.com', 'motherjones.com', 'msnbc.com',
                       'nationalreview.com', 'nbcnews.com', 'newsweek.com', 'newyorker.com',
                       'npr.org', 'nydailynews.com', 'nypost.com', 'nytimes.com', 'pbs.org',
                       'politico.com', 'propublica.org', 'realclearpolitics.com','reuters.com',
                       'rollcall.com', 'rt.com', 'salon.com', 'news.sky.com', 'slate.com',
                       'sputniknews.com', 'theatlantic.com', 'theguardian.com', 'thehill.com',
                       'time.com', 'usatoday.com', 'vox.com', 'washingtonpost.com',
                       'washingtontimes.com', 'weeklystandard.com', 'westernjournal.com', 'wsj.com',
                       'zerohedge.com']
# Ten outlets closest to the estimated center, restricted to well-observed
# outlets. NOTE(review): df.where() keeps non-qualifying rows as all-NaN
# (they sort to the end), so head(10) is clean only if at least 10 outlets
# pass the num_sharers filter; a boolean mask df[df['num_sharers'] > 1000]
# would drop them outright — confirm which behavior is intended.
display(df.where(df['num_sharers'] > 1000).sort_values('diff_from_center').loc[:,interesting_cols].head(10))
# Signed distance from center for the hand-picked domains, sorted by value.
# NOTE(review): .loc with a list of labels raises KeyError on newer pandas
# if any domain is missing from the index — verify all domains are present.
_ = (df.loc[interesting_domains, interesting_cols]['mean_sharer_ideo'] - center).sort_values(ascending=False).plot.barh(figsize=(12, 20))
Observations