Updated Mar 23, 2020
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# The news subdomains tracked throughout the analysis below.
subdomains = ['huffpost.com', 'nytimes.com', 'foxnews.com', 'breitbart.com', 'cnn.com', 'nypost.com']
def calc_percents(site_sharers_filename, total_sharers_filename, date_field):
    """Compute, per (subdomain, period), the fraction of each ideological
    pole's sharers who shared that subdomain.

    Parameters
    ----------
    site_sharers_filename : str
        CSV with columns: subdomain, <date_field>, ideological_pole, n_sharers.
    total_sharers_filename : str
        CSV with columns: <date_field>, ideological_pole, n_sharers
        (panel-wide totals per period).
    date_field : str
        Name of the date column ('day', 'week', 'month', or 'quarter').

    Returns
    -------
    pandas.DataFrame
        Indexed by (subdomain, date) with columns 'left', 'right',
        'left_total', 'right_total', 'left_percent', 'right_percent', and
        'left_percent_over_sum_percents' (left share renormalized so the
        two poles' percents sum to 1).
    """
    site_counts = pd.read_csv(site_sharers_filename,
                              index_col=['subdomain'], parse_dates=[date_field])
    # Wide layout: one row per (subdomain, date), one column per pole.
    site_counts = site_counts.pivot_table(
        index=['subdomain', date_field],
        columns='ideological_pole',
        values='n_sharers',
    ).fillna(0)

    panel_totals = pd.read_csv(total_sharers_filename,
                               index_col=[date_field], parse_dates=[date_field])
    panel_totals = panel_totals.pivot_table(
        index=[date_field], columns='ideological_pole', values='n_sharers')

    # Attach the per-period panel totals to every subdomain row for that period.
    merged = site_counts.join(panel_totals, on=date_field, rsuffix='_total')
    merged['left_percent'] = merged['left'] / merged['left_total']
    merged['right_percent'] = merged['right'] / merged['right_total']
    merged['left_percent_over_sum_percents'] = (
        merged['left_percent'] / (merged['left_percent'] + merged['right_percent'])
    )
    return merged
def calc_ideo_rolling(df, average_over):
    """Ideology score per (subdomain, date) using a simple rolling mean over
    `average_over` consecutive periods within each subdomain.

    Scores are scaled so -1 corresponds to entirely left-pole sharers and
    +1 to entirely right-pole sharers (score = 1 - 2 * left share).

    Fix: the original passed `level=1` to `.rolling()`, but `Rolling` has no
    `level` parameter — current pandas raises TypeError on it. The grouped
    rolling already operates within each subdomain, so the argument is
    simply dropped.
    """
    rolled = df.groupby(level=0).rolling(average_over).mean()
    # groupby.rolling prepends the group key as an extra index level;
    # drop it so the index is (subdomain, date) again.
    left_share = rolled.droplevel(0)['left_percent_over_sum_percents']
    return 1 - left_share * 2
def calc_ideo_ewm(df, average_over):
    """Ideology score per (subdomain, date) using an exponentially-weighted
    rolling mean (span=`average_over`) within each subdomain.

    Scores are scaled so -1 corresponds to entirely left-pole sharers and
    +1 to entirely right-pole sharers (score = 1 - 2 * left share).
    """
    def _smooth_one_subdomain(group):
        smoothed = group.ewm(span=average_over).mean()
        # Drop the subdomain level here; groupby.apply re-attaches the
        # group key as the outer index level of the combined result.
        return smoothed.droplevel(0)['left_percent_over_sum_percents']

    left_share = df.groupby(level=0).apply(_smooth_one_subdomain)
    return 1 - left_share * 2
def calc_ideo_resample(df, average_over):
    """Ideology score per (subdomain, bucket) by averaging over consecutive,
    non-overlapping time buckets of size `average_over` (a resample rule
    such as '30D') within each subdomain.

    Scores are scaled so -1 corresponds to entirely left-pole sharers and
    +1 to entirely right-pole sharers (score = 1 - 2 * left share).
    """
    # level=1 tells resample which MultiIndex level holds the dates.
    bucket_means = df.groupby(level=0).resample(average_over, level=1).mean()
    left_share = bucket_means['left_percent_over_sum_percents']
    return 1 - left_share * 2
def calc_ideo_simple(df):
    """Unsmoothed per-period ideology score.

    Scores are scaled so -1 corresponds to entirely left-pole sharers and
    +1 to entirely right-pole sharers (score = 1 - 2 * left share).
    """
    left_share = df['left_percent_over_sum_percents']
    return 1 - 2 * left_share
There are three different parameters that need setting: the bucketing granularity (day/week/month/quarter), the averaging method, and the smoothing span.
Here are four windows we are comparing:
The longer the window, the less effect the very frequent shares have on the score (and the less temporal resolution we have). I'm going to use the equivalent of exponential smoothing over three months for each.
# Load the panel's sharer counts at four temporal granularities. Each CSV pair
# covers 2016-01-01 through 2018-09-01; the date column name matches the
# granularity ('day', 'week', 'month', 'quarter').
df_d = calc_percents('pole_subdomain_sharers_daily_20160101-20180901_panel.csv',
                     'pole_total_sharers_daily_20160101-20180901_panel.csv',
                     'day')
df_w = calc_percents('pole_subdomain_sharers_weekly_20160101-20180901_panel.csv',
                     'pole_total_sharers_weekly_20160101-20180901_panel.csv',
                     'week')
df_m = calc_percents('pole_subdomain_sharers_monthly_20160101-20180901_panel.csv',
                     'pole_total_sharers_monthly_20160101-20180901_panel.csv',
                     'month')
df_q = calc_percents('pole_subdomain_sharers_quarterly_20160101-20180901_panel.csv',
                     'pole_total_sharers_quarterly_20160101-20180901_panel.csv',
                     'quarter')
# Exponentially-weighted ideology scores; each span covers roughly one quarter
# of data (90 days ~ 12 weeks ~ 3 months ~ 1 quarter) so the four series are
# comparable.
ideo_d_90_ewm = calc_ideo_ewm(df_d, 90)
ideo_w_12_ewm = calc_ideo_ewm(df_w, 12)
ideo_m_3_ewm = calc_ideo_ewm(df_m, 3)
ideo_q_1_ewm = calc_ideo_ewm(df_q, 1)
# One panel per granularity, with shared axes so the four window choices are
# directly comparable across the tracked subdomains.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1,4,sharey=True, sharex=True)
ideo_d_90_ewm.loc[subdomains].unstack(level=0).plot.line(ax=ax1)
ideo_w_12_ewm.loc[subdomains].unstack(level=0).plot.line(ax=ax2, legend=False)
ideo_m_3_ewm.loc[subdomains].unstack(level=0).plot.line(ax=ax3, legend=False)
ideo_q_1_ewm.loc[subdomains].unstack(level=0).plot.line(ax=ax4, figsize=(18, 4), legend=False)
plt.tight_layout()
# Bar chart comparing each subdomain's score at the most recent period of each
# granularity. The lookup dates differ because each granularity's final bucket
# starts on a different date.
pd.DataFrame({
    'Day window': ideo_d_90_ewm.loc[(subdomains,'2018-08-31')].droplevel(1),
    'Week window': ideo_w_12_ewm.loc[(subdomains,'2018-08-27')].droplevel(1),
    'Month window': ideo_m_3_ewm.loc[(subdomains,'2018-08-01')].droplevel(1),
    'Quarter window': ideo_q_1_ewm.loc[(subdomains,'2018-07-01')].droplevel(1),
}).sort_values('Day window').plot.bar(title='Ideo for Most Recent Period')
We can see in this graph that sites move toward the center. It also suggests that sites whose sharers are more fervent (likely sites on the edges) will move to the center less. I'm looking at the difference between Fox and Breitbart.
These are the three averaging methods we're comparing against the unsmoothed baseline:
# Reload the daily data and compare three ~30-day smoothing strategies against
# the raw (unsmoothed) daily score.
df_d = calc_percents('pole_subdomain_sharers_daily_20160101-20180901_panel.csv',
                     'pole_total_sharers_daily_20160101-20180901_panel.csv',
                     'day')
ideo_d_simple = calc_ideo_simple(df_d)
ideo_d_30_roll = calc_ideo_rolling(df_d, 30)
ideo_d_30_ewm = calc_ideo_ewm(df_d, 30)
ideo_d_30_resamp = calc_ideo_resample(df_d, '30D')
# Top panel: unsmoothed daily score; bottom panel: the three smoothed series.
fig, (ax1, ax2) = plt.subplots(2,1,sharex=True)
ideo_d_simple.loc['foxnews.com'].plot.line(title='"foxnews.com"\nNo Averaging', ax=ax1)
averaging = pd.DataFrame({
    'Rolling Mean': ideo_d_30_roll.loc['foxnews.com'],
    'Rolling Expo-Weighted Mean': ideo_d_30_ewm.loc['foxnews.com'],
    'Mean Consecutive Buckets': ideo_d_30_resamp.loc['foxnews.com']
})
# The resampled series only has one point per 30-day bucket; interpolate so it
# draws as a continuous line alongside the daily series.
averaging['Mean Consecutive Buckets'] = averaging['Mean Consecutive Buckets'].interpolate()
averaging.plot.line(figsize=(12, 3), alpha=0.5, ax=ax2)
I prefer the exponentially-weighted rolling mean to the simple rolling mean as it allows the influence of spikes to decay over time so single events don't get spread out as much. I prefer it to the simple consecutive buckets because it gives us higher temporal resolution, and the consecutive buckets are sensitive to when the weekly breaks occur.
Once we have our number of unique sharers per day/week/month/quarter, we'll likely want to smooth these scores a bit if we want to use them longitudinally. As an example, I'll use weekly scores and use an exponentially-weighted rolling mean that decays at four different rates:
# Apply four decay rates (ewm spans, in weeks) to the weekly scores to see how
# the span choice trades temporal resolution against stability.
ideo_w_2_ewm = calc_ideo_ewm(df_w, 2)
ideo_w_4_ewm = calc_ideo_ewm(df_w, 4)
ideo_w_12_ewm = calc_ideo_ewm(df_w, 12)
ideo_w_32_ewm = calc_ideo_ewm(df_w, 32)
pd.DataFrame({
    2: ideo_w_2_ewm.loc['foxnews.com'],
    4: ideo_w_4_ewm.loc['foxnews.com'],
    12: ideo_w_12_ewm.loc['foxnews.com'],
    32: ideo_w_32_ewm.loc['foxnews.com']
}).plot.line(figsize=(12, 3), alpha=0.5, title='Smoothing for "foxnews.com"')
This parameter mostly influences how much and how long a spike affects the ideology.