Updated June 13, 2019
We have collected tweets for our sampled set of Twitter users from 2016 through 2018. We've split out their domain shares by month. Now we're looking at how those sharing patterns have changed over time.
%matplotlib inline
import datetime, sys, csv, collections, math
import tqdm, ujson
import pandas as pd
import numpy as np
sys.path.append("/berkman/home/jclark/mc/tools/twitter")
import utils
sys.path.append("/berkman/home/jclark/mc/projects/ideology_from_followers/cleaned_up/data")
from collapsible_domains import COLLAPSIBLE_DOMAINS
# Analysis window: January 2016 through August 2018, inclusive (32 months).
# The tail of 2018 is commented out (not yet included in the collected data).
MONTHS = [
    '2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06', '2016-07',
    '2016-08', '2016-09', '2016-10', '2016-11', '2016-12', '2017-01', '2017-02',
    '2017-03', '2017-04', '2017-05', '2017-06', '2017-07', '2017-08', '2017-09',
    '2017-10', '2017-11', '2017-12', '2018-01', '2018-02', '2018-03', '2018-04',
    '2018-05', '2018-06', '2018-07', '2018-08', #'2018-09', '2018-10', '2018-11', '2018-12'
]
# Months are pooled into 4-month periods to reduce sparsity (see prose above).
MONTHS_PER_PERIOD = 4
# BUG FIX: the original computed math.ceil(len(MONTHS) // MONTHS_PER_PERIOD),
# but `//` already floors, so the ceil was a no-op and any partial trailing
# period would have been dropped. True ceiling division (`/` then ceil) keeps
# a final partial period. With the current 32 months the value is unchanged (8).
NUM_PERIODS = math.ceil(len(MONTHS) / MONTHS_PER_PERIOD)
We created a panel of accounts in the create_panel notebook. Load them in.
# The panel was built in the create_panel notebook; one numeric account id per line.
with open('data/panel_accounts.txt') as acct_file:
    panel_accounts = {int(line.strip()) for line in acct_file}
# Per-account ideology scores for the panel, keyed by account id.
panel_ideos = pd.read_pickle('data/panel_ideos.pkl')
Now that we have a panel of accounts, we need to calculate ideology scores of media sources for each time period. We originally did this by month, but the data was a little sparse, so we changed it to every 4 months because our overall time period (32 months) was divisible in this way.
# A URL must have been shared by at least this many panel accounts to be kept.
MIN_SHARING_ACCTS = 3
# Maps each period's start month (e.g. '2016-01') -> per-URL summary DataFrame.
periodic_url_sharer_ideos = {}
for period in tqdm.tqdm_notebook(range(NUM_PERIODS), smoothing=0):
    # Each period is labeled by its first month.
    start_month = MONTHS[period * MONTHS_PER_PERIOD]
    # url -> set of panel account ids that shared it (accumulated across the period's months)
    url_to_accts = collections.defaultdict(set)
    url_to_domain = {}
    url_to_subdomain = {}
    # Merge the precomputed per-month stats for the months in this period.
    for month in MONTHS[period * MONTHS_PER_PERIOD:period * MONTHS_PER_PERIOD + MONTHS_PER_PERIOD]:
        month_url_to_accts = pd.read_pickle(f'../ideology_from_followers/data/historical_work/split_monthly/stats/{month}/url_to_users.pkl')
        url_to_domain.update(pd.read_pickle(f'../ideology_from_followers/data/historical_work/split_monthly/stats/{month}/url_to_domain.pkl'))
        url_to_subdomain.update(pd.read_pickle(f'../ideology_from_followers/data/historical_work/split_monthly/stats/{month}/url_to_subdomain.pkl'))
        for url, acct_ids in month_url_to_accts.items():
            # Skip links back to Twitter itself; we only care about external media.
            if url.startswith('https://twitter.com'): continue
            # Restrict each URL's sharers to accounts in our panel.
            panel_shares = (set(acct_ids) & panel_accounts)
            url_to_accts[url] |= panel_shares
    # Drop thinly-shared URLs. Iterate over a snapshot of the keys (list(...))
    # because we delete from the dict while iterating.
    for url in list(url_to_accts.keys()):
        num_accts = len(url_to_accts[url])
        if num_accts < MIN_SHARING_ACCTS:
            del url_to_accts[url]
    # Build a rectangular frame: one column per URL, rows are that URL's
    # sharers' ideology scores, NaN-padded out to the longest sharer list so
    # all columns have equal length.
    # NOTE(review): max() raises on an empty sequence — assumes at least one
    # URL survives the MIN_SHARING_ACCTS filter each period.
    max_length = max(len(accts) for accts in url_to_accts.values())
    url_to_acct_ideos = pd.DataFrame.from_dict({
        url: [panel_ideos[acct] for acct in accts] + ([np.nan] * (max_length - len(accts)))
        for url, accts in url_to_accts.items()}, dtype='float32')
    # Per-URL summary: mean/std/count ignore the NaN padding; the subdomain and
    # domain dicts are aligned to the URL index by pandas.
    url_sharer_ideo_summary = pd.DataFrame({
        'sharers_ideo_mean': url_to_acct_ideos.mean(),
        'sharer_ideo_stddev': url_to_acct_ideos.std(),
        'num_sharers': url_to_acct_ideos.count(),
        'subdomain': url_to_subdomain,
        'domain': url_to_domain,
    }, index=url_to_acct_ideos.columns)
    periodic_url_sharer_ideos[start_month] = url_sharer_ideo_summary
    #display(url_sharer_ideo_summary.sort_values('num_sharers', ascending=False).head(5))
We calculated ideology means for every URL. Fold those up into subdomains.
# A subdomain needs at least this many qualifying URLs in a period to be kept.
MIN_NUM_URLS = 5
# Maps period start month -> per-subdomain ideology summary.
periodic_subdomain_sharer_ideo_summary = {}
for start_month, url_ideos in tqdm.tqdm_notebook(periodic_url_sharer_ideos.items(), smoothing=0):
    # Collapse known subdomain aliases in place, one rule at a time (sequential
    # application, same as the original: later rules can see earlier rewrites).
    for old_subdomain, canonical in COLLAPSIBLE_DOMAINS.items():
        alias_rows = url_ideos['subdomain'] == old_subdomain
        url_ideos.loc[alias_rows, 'subdomain'] = canonical
    # Aggregate the per-URL means up to the subdomain level.
    by_subdomain = url_ideos.groupby('subdomain')
    summary = pd.DataFrame({
        'url_ideo_mean': by_subdomain['sharers_ideo_mean'].mean(),
        'url_ideo_stddev': by_subdomain['sharers_ideo_mean'].std(),
        'url_count': by_subdomain.count()['domain'],
    })
    periodic_subdomain_sharer_ideo_summary[start_month] = summary.query(f'url_count >= {MIN_NUM_URLS}')
First, let's look at how the ideology of the most popular sites has changed.
# Wide matrix of ideology means: rows = period start months, columns = subdomains.
ideo_means_by_period = {
    start_month: summary['url_ideo_mean']
    for start_month, summary in periodic_subdomain_sharer_ideo_summary.items()
}
subdomain_ideo_over_time = pd.DataFrame.from_dict(ideo_means_by_period, orient='index')

# Rank subdomains by total panel sharers in the final period (starts 2018-05)
# and take the ten most-shared.
final_period_totals = periodic_url_sharer_ideos['2018-05'].groupby('subdomain').sum()
ranked = final_period_totals.sort_values('num_sharers', ascending=False)
sites = ranked.head(10).index.values

ax = subdomain_ideo_over_time[sites].plot.line(figsize=(12, 6))
_ = ax.set_title('Audience Ideo of Top 10 Sites (by # of sharers in 2018 Q3)')
Observations
That was the top 10. Let's see what 11-20 look like:
# Same ranking as above, but the next ten sites (ranks 11-20).
ranked = periodic_url_sharer_ideos['2018-05'].groupby('subdomain').sum().sort_values('num_sharers', ascending=False)
sites = ranked.index[10:20].values
ax = subdomain_ideo_over_time[sites].plot.line(figsize=(12, 6))
_ = ax.set_title('Audience Ideo of Sites 11-20 (by # of sharers in 2018 Q3)')
Observations
- wsj.com and youtube.com don't move much.
- rawstory.com and shareblue.com.

Let's see what the overall histograms look like.
import joypy
# One ridge per period (hence the transpose: joyplot draws one ridge per
# column group), each a histogram of that period's subdomain ideology scores,
# stacked vertically in time order.
fig, axes = joypy.joyplot(subdomain_ideo_over_time.T, overlap=0, figsize=(5, 10),
                          x_range=(-1, 2.5), alpha=1, linewidth=0.5, hist=True, grid=True, bins=50, density=True)
_ = axes[0].set_title('Media Source Ideology Histograms over Time')
Observations
Let's look at the sites with the highest standard deviation. The idea here being that these are the sites that will have changed the most over time.
# Keep only subdomains observed in more than 6 of the periods, then rank them
# by the standard deviation of their ideology mean across periods (the biggest
# movers). Column-wise boolean selection replaces the original T[mask].T trick.
well_observed = subdomain_ideo_over_time.count() > 6
volatility = subdomain_ideo_over_time.loc[:, well_observed].std().sort_values(ascending=False)
sites = volatility.head(10).index.values
_ = subdomain_ideo_over_time.loc[:, sites].plot.line(figsize=(6, 15), subplots=True, sharey=True,
                                                     title='Ideo over time of top 10 by standard deviation')
Observations
- whitehouse.gov, petitions.whitehouse.gov and obamawhitehouse.archives.gov are here (well, kinda) and doing what we'd expect.
- 11alive.com is Atlanta local news (WXIA-TV), and chicago.cbslocal.com is Chicago local news.
- tabletmag.com is a Jewish news and culture site, and patheos.com is religious discussion focused on Christianity.
- drive.google.com, teespring.com and eventbrite.com are all apolitical, platform-y things.

TODO
# Persist the wide subdomain-by-period matrix, plus one detail table per period.
subdomain_ideo_over_time.T.to_csv('data/subdomain_continuous_ideo_over_time.csv', index_label='subdomain')
for period_start, period_summary in periodic_subdomain_sharer_ideo_summary.items():
    period_summary.to_csv(f'data/subdomain_continuous_ideo_details/{period_start}.csv', index_label='subdomain')