Updated Jun 26, 2019
How are the discrete ideology scores of media sources changing over time?
%matplotlib inline
import collections, math, sys
import pandas as pd
import tqdm
sys.path.append("/berkman/home/jclark/mc/projects/ideology_from_followers/cleaned_up/data")
from collapsible_domains import COLLAPSIBLE_DOMAINS
# Months covered by the analysis. 2018-09 onward are excluded pending data.
MONTHS = [
    '2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06', '2016-07',
    '2016-08', '2016-09', '2016-10', '2016-11', '2016-12', '2017-01', '2017-02',
    '2017-03', '2017-04', '2017-05', '2017-06', '2017-07', '2017-08', '2017-09',
    '2017-10', '2017-11', '2017-12', '2018-01', '2018-02', '2018-03', '2018-04',
    '2018-05', '2018-06', '2018-07', '2018-08', #'2018-09', '2018-10', '2018-11', '2018-12'
]
MONTHS_PER_PERIOD = 4
# True ceiling division so a partial trailing period still counts as a period.
# The previous math.ceil(len(MONTHS) // MONTHS_PER_PERIOD) floored first
# (// is integer division), which would silently drop a final partial period
# whenever len(MONTHS) is not a multiple of MONTHS_PER_PERIOD.
NUM_PERIODS = math.ceil(len(MONTHS) / MONTHS_PER_PERIOD)
We created a panel of accounts in the create_panel notebook. Load them in.
# Panel account ids: one integer id per line of the text file.
with open('data/panel_accounts.txt') as f:
    panel_accounts = {int(line.strip()) for line in f}
# Per-account continuous ideology scores (built in the create_panel notebook).
panel_ideos = pd.read_pickle('data/panel_ideos.pkl')
Divide the panel into two groups: left and right of center. We've determined the center to be about 0.444 in a separate notebook.
# Ideology threshold separating the two panel halves: scores at or below
# CENTER are 'left', scores above it are 'right'.
CENTER = 0.444
panel_groups = pd.DataFrame(
    {
        'ideo': panel_ideos,
        'group': panel_ideos.map(lambda score: 'left' if score <= CENTER else 'right'),
    },
    index=panel_ideos.index,
)
_ = panel_groups.groupby('group').count().plot.bar(title='Number of accounts in each ideo group')
This is the total number of accounts in each group, but we're not normalizing by number of accounts, we're normalizing by number of shares within each group. That data changes month to month, so we'll be normalizing differently each month.
Let's look at the shares of each subdomain from our account panel. We have the data monthly, but we're going to collapse it into every four months so the data is less sparse.
# For each 4-month period, count how many left-group and right-group panel
# members shared each subdomain. One DataFrame per period, indexed by
# subdomain with 'left'/'right' count columns.
periodic_subdomain_group_share_counts = []
for period in tqdm.tqdm_notebook(range(NUM_PERIODS)):
    group_counts_by_subdomain = collections.defaultdict(collections.Counter)
    period_start = period * MONTHS_PER_PERIOD
    for month in MONTHS[period_start:period_start + MONTHS_PER_PERIOD]:
        month_acct_shares = pd.read_pickle(f'../ideology_from_followers/data/historical_work/split_monthly_2/stats/{month}/subdomain_to_users.pkl')
        for subdomain, acct_ids in month_acct_shares.items():
            # Restrict the sharers of this subdomain to accounts in our panel.
            sharers_in_panel = {acct for acct in acct_ids if acct in panel_accounts}
            month_group_counts = collections.Counter(
                panel_groups.reindex(sharers_in_panel)['group'].values)
            group_counts_by_subdomain[subdomain] += month_group_counts
    periodic_subdomain_group_share_counts.append(
        pd.DataFrame.from_dict(group_counts_by_subdomain, orient='index').fillna(0))
# Minimum total panel shares a subdomain needs within a period to be kept;
# drops extremely sparse rows before normalization.
MIN_SHARES = 5
# Sum each row directly (axis=1) instead of the previous df.T.sum(), which
# materialized a transposed copy of every frame just to sum its rows.
periodic_subdomain_group_share_counts = [
    df[df.sum(axis=1) >= MIN_SHARES]
    for df in periodic_subdomain_group_share_counts
]
Now that we've counted the number of panel members from each group that have shared each subdomain, we need to collapse redundant subdomains and then normalize the number of shares across groups. Then we break the media sites into five groups based on their left-right mixture.
# Normalize each period's counts: collapse redundant subdomains into their
# canonical domain, correct for unequal group share volume, and bucket each
# subdomain into one of five ideology groups by its left/right share mixture.
periodic_subdomain_group_share_counts_normed = []
for group_share_counts in tqdm.tqdm_notebook(periodic_subdomain_group_share_counts, smoothing=0):
    # Work on a copy so the raw per-period frames are never mutated in place
    # (the previous in-place update() could alter the input list's frames).
    group_share_counts = group_share_counts.copy()
    for collapsible_domain, new_domain in COLLAPSIBLE_DOMAINS.items():
        if collapsible_domain in group_share_counts.index:
            if new_domain not in group_share_counts.index:
                # .loc row assignment replaces the deprecated DataFrame.append
                # (removed in pandas 2.0).
                group_share_counts.loc[new_domain] = pd.Series({'left': 0, 'right': 0})
            # Fold the redundant row's counts into the canonical row directly.
            # The previous code passed this row-sum Series to
            # DataFrame.update(), which aligns on *column* labels, found none
            # that matched, and silently discarded the collapsed counts.
            group_share_counts.loc[new_domain] = (
                group_share_counts.loc[new_domain]
                + group_share_counts.loc[collapsible_domain])
            group_share_counts = group_share_counts.drop(collapsible_domain)
    # Total shares per panel group this period; basis for normalization.
    group_sizes = group_share_counts.sum().reindex(['left', 'right'])
    size_correction_factor = group_sizes.max() / group_sizes.min()
    group_share_counts = group_share_counts.fillna(0).drop('twitter.com')
    # NOTE(review): the correction is applied to 'right' only, which assumes
    # the right group always has the smaller share volume — confirm upstream.
    group_share_counts['right_corrected'] = group_share_counts['right'] * size_correction_factor
    group_share_counts['total_shares'] = group_share_counts['left'] + group_share_counts['right_corrected']
    # Map the share mix onto [-1, 1]: -1 = all-left audience, +1 = all-right.
    group_share_counts['left_to_right_mixture'] = (
        2 * (group_share_counts['right_corrected'] / group_share_counts['total_shares']) - 1)
    # Endpoints widened slightly past +/-1 so exact -1 and 1 land in the outer bins.
    BREAKPOINTS = [-1.01, -0.6, -0.2, 0.2, 0.6, 1.01]
    group_share_counts['ideo_group'] = pd.cut(
        group_share_counts['left_to_right_mixture'], bins=BREAKPOINTS,
        labels=['left', 'center-left', 'center', 'center-right', 'right'])
    periodic_subdomain_group_share_counts_normed.append({
        'group_sizes': group_sizes,
        'size_correction_factor': size_correction_factor,
        'subdomain_ideos': group_share_counts
    })
How many domains do we have per 4-month period?
# Bar chart of how many subdomains survived the MIN_SHARES filter each period.
subdomains_per_period = pd.Series(
    [len(period['subdomain_ideos']) for period in periodic_subdomain_group_share_counts_normed])
ax = subdomains_per_period.plot.bar(title='Number of subdomains per 4-month period')
Let's pull out a simple dataset of left-to-right sharing mixture over time for each subdomain.
# One row per period (keyed by the period's first month), one column per
# subdomain, holding the left-to-right share mixture.
subdomain_ideo_over_time = pd.DataFrame.from_dict(
    {
        MONTHS[idx * MONTHS_PER_PERIOD]: period['subdomain_ideos']['left_to_right_mixture']
        for idx, period in enumerate(periodic_subdomain_group_share_counts_normed)
    },
    orient='index')
What do the histories of the top 10 subdomains (by total panel shares) look like over time?
# Top 10 subdomains by total panel shares, ranked in the final period.
last_period_ideos = periodic_subdomain_group_share_counts_normed[-1]['subdomain_ideos']
sites = last_period_ideos.sort_values('total_shares', ascending=False).index[:10].values
_ = subdomain_ideo_over_time.loc[:, sites].plot.line(
    figsize=(12, 6),
    title='Audience Ideo of Top 10 Sites (by # of shares over entire period)')
Observations
- foxnews.com is right; nbcnews.com ends up furthest left
- thehill.com, politico.com, cnn.com, nbcnews.com, washingtonpost.com move left
- youtube.com moves slightly right
- nytimes.com moves left nearing the inauguration and then moves back to where it was

That was the top 10. Let's look at sites 11-20.
# Subdomains ranked 11-20 by total panel shares in the final period.
last_period_ideos = periodic_subdomain_group_share_counts_normed[-1]['subdomain_ideos']
sites = last_period_ideos.sort_values('total_shares', ascending=False).index[10:20].values
_ = subdomain_ideo_over_time.loc[:, sites].plot.line(
    figsize=(12, 6),
    title='Audience Ideo of Sites 11-20 (by # of shares in Q3 2018)')
Observations
- breitbart.com is right; npr.org and theguardian.com are left; nypost.com is center-right; all stay flattish
- pscp.tv starts center and moves right
- wsj.com stays center-right

Let's see how much these trends hold by looking at the breakdown of media-source ideology groups per 4-month period.
# Count media sources per ideo group per period, then plot each group's
# fraction of all sources over time.
periodic_counts = pd.DataFrame.from_dict(
    {
        MONTHS[idx * MONTHS_PER_PERIOD]:
            period['subdomain_ideos']['ideo_group'].value_counts().reindex(
                ['left', 'center-left', 'center', 'center-right', 'right'])
        for idx, period in enumerate(periodic_subdomain_group_share_counts_normed)
    },
    orient='index')
# Normalize each period (row) to fractions; transpose so sums run per row.
group_fractions = (periodic_counts.T / periodic_counts.T.sum()).T
display(group_fractions)
ax = group_fractions.plot.line(
    color=['#0d3b6e', '#869db6', '#2a7526', '#d8919e', '#b1243e'],
    figsize=(10, 6),
    title="Each ideo group's share of media sources per 4-month period")
Observations
- left and right generally gain, left more than right
- center and center-left lose a bunch up to the inauguration and then recover some
- center-right gains up to the inauguration and then peters out again

TODO
# Persist the mixture time series (one row per subdomain) and, per period,
# the full normalized detail frame.
subdomain_ideo_over_time.T.to_csv(
    'data/subdomain_discrete_ideo_over_time_by_account_20190626.csv',
    index_label='subdomain')
for period_idx, period in enumerate(periodic_subdomain_group_share_counts_normed):
    start_month = MONTHS[period_idx * MONTHS_PER_PERIOD]
    period['subdomain_ideos'].to_csv(
        f'data/subdomain_discrete_ideo_by_account_details/{start_month}.csv',
        index_label='subdomain')