Updated Apr 22, 2019
Playing with fitting mixture models to URL sharing. Note that there are two distributions we could be fitting: either the ideological distribution of every Twitter account who shared the domain, or, the mean ideology of sharers of individual URLs within a domain, which we're doing here.
Table of Contents
%matplotlib inline
import sklearn.mixture, tqdm, lmfit.models, umap, scipy.optimize
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.offline as plotly
import plotly.graph_objs as go
# URL-level data: one row per URL with the mean ideology estimate of its sharers.
df = pd.read_csv('../data/all_samples_combined/url_ideo_est_sharer_mean.csv', index_col=0)
df.sample(5)
def fit_site_mixture(df, site, n_components=2, plot=False):
    """Fit a Bayesian Gaussian mixture to one subdomain's URL ideology estimates.

    Parameters
    ----------
    df : DataFrame with 'subdomain' and 'ideo_est' columns.
    site : subdomain to select (exact match against the 'subdomain' column).
    n_components : maximum number of mixture components.
    plot : if True, draw the site's histogram against the overall histogram
        with the fitted component means marked, plus a mean-vs-weight scatter.

    Returns
    -------
    DataFrame with one row per component ('mean', 'weight', 'covariance'),
    sorted by mean.
    """
    site_mixture = sklearn.mixture.BayesianGaussianMixture(
        n_components=n_components,
        covariance_type='spherical',
        max_iter=1000,
        # A small concentration prior encourages unneeded components to
        # shrink toward zero weight.
        weight_concentration_prior=0.001
    )
    site_url_ideos = df[df['subdomain'] == site]['ideo_est'].dropna()
    site_fitted = site_mixture.fit(site_url_ideos.values.reshape(-1, 1))
    if plot:
        # (was `fix, ...` — typo for `fig`; the figure handle is unused)
        fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
        site_url_ideos.plot.hist(bins=100, alpha=0.5, ax=ax1, density=True)
        # Overall distribution as a visual baseline.
        df['ideo_est'].plot.hist(bins=100, alpha=0.5, ax=ax1, density=True)
        # Mark the fitted component means.
        ax1.vlines(site_fitted.means_, 0, ax1.get_ylim()[1], linestyles='dotted', alpha=0.5)
        # Echo the site name next to its figure.
        print(site)
    site_fit_params = pd.DataFrame({
        'mean': site_fitted.means_.reshape(1, -1)[0],
        'weight': site_fitted.weights_.reshape(1, -1)[0],
        'covariance': site_fitted.covariances_.reshape(1, -1)[0]
    }).sort_values('mean')
    if plot:
        ax2.set_ylim(0, 1)
        sns.scatterplot(data=site_fit_params, x='mean', y='weight', ax=ax2)
    return site_fit_params
# Example fits: a right-leaning outlet, a far-right outlet, and YouTube with
# two and then three components.
fit_site_mixture(df, 'foxnews.com', plot=True)
fit_site_mixture(df, 'infowars.com', plot=True)
fit_site_mixture(df, 'youtube.com', plot=True)
fit_site_mixture(df, 'youtube.com', n_components=3, plot=True)
# Overall distribution of URL-level ideology estimates across all subdomains.
_ = df['ideo_est'].plot.hist(bins=100, alpha=0.5, density=True)
df['ideo_est'].describe()
# Empirical bounds of the ideology scale and rough component centers,
# chosen by eye from the overall histogram above.
RANGE = (-2.0, 2.6)
CENTER = 0.4
LEFT_CENTER = -1
RIGHT_CENTER = 1.6
def index_of(arrval, value):
    """Return the largest index whose element is at or below *value*.

    Returns 0 when *value* falls below every element of *arrval*.
    """
    at_or_below = np.where(arrval <= value)[0]
    if at_or_below.size == 0:
        return 0
    return at_or_below.max()
def site_to_function_hist(df, site):
    """Histogram one subdomain's URL ideology estimates over RANGE.

    Returns (bin_centers, densities). Bin count is auto-selected but floored
    at 30 bins, since very coarse histograms made the fits unstable.
    """
    ideos = df[df['subdomain'] == site]['ideo_est'].dropna()
    densities, edges = np.histogram(ideos, bins='auto', density=True, range=RANGE)
    if len(edges) <= 30:
        densities, edges = np.histogram(ideos, bins=30, density=True, range=RANGE)
    # Midpoint of each pair of adjacent edges.
    centers = (edges[:-1] + edges[1:]) / 2
    return (centers, densities)
def report(site, result):
    """Print the site name followed by LMFit's fit report (correlations >= 0.5 only)."""
    print(site)
    print(result.fit_report(min_correl=0.5))
def plot(site, x, y, init, result):
    """Two-panel diagnostic: histogram + overall model (top), histogram + components (bottom).

    x, y : histogram bin centers and densities from site_to_function_hist.
    init : the model evaluated at its initial (pre-fit) parameters.
    result : the fitted lmfit ModelResult.
    """
    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(4, 6))
    # Dense, evenly spaced x positions for drawing smooth model curves; the
    # histogram call is used only for its bin-edge positions.
    _, sample_at = np.histogram([1], bins=200, density=True, range=RANGE)
    ax1.set_title(site)
    # Raw histogram data in purple.
    ax1.bar(x, y, color='purple', alpha=0.2, width=(RANGE[1] - RANGE[0]) / len(x))
    # Initial model in dotted gray, fitted model in orange.
    ax1.plot(x, init, 'k--', alpha=0.1)
    ax1.plot(sample_at, result.eval(x=sample_at), 'orange', alpha=0.6)
    comps = result.eval_components(x=sample_at)
    ax2.plot(x, y, color='purple', alpha=0.3)
    # Individual components: left (blue), center (green), right (red).
    ax2.plot(sample_at, comps['left_'], 'b--', alpha=0.4)
    ax2.plot(sample_at, comps['center_'], 'g--', alpha=0.4)
    ax2.plot(sample_at, comps['right_'], 'r--', alpha=0.4)
def fit_skewed_normal_mixture(x, y):
    """Fit a three-component mixture to a histogrammed ideology distribution.

    Components: skewed Gaussian (left), plain Gaussian (center), skewed
    Gaussian (right). Each component's center is bounded to a non-overlapping
    segment of the axis, and its initial guess uses only the data within that
    segment, to stop components from drifting into each other's territory.

    Parameters
    ----------
    x, y : histogram bin centers and densities.

    Returns
    -------
    (init, out) : the model evaluated at its initial parameters, and the
    fitted lmfit ModelResult.
    """
    breakpoints = [RANGE[0], -0.4, 0.0, 0.8, 1.2, RANGE[1]]
    ix = [index_of(x, b) for b in breakpoints]

    def _skewed_component(prefix, lo, hi, x_slice, y_slice):
        # Build one SkewedGaussianModel with guessed params bounded to [lo, hi].
        component = lmfit.models.SkewedGaussianModel(prefix=prefix)
        component_pars = component.guess(y_slice, x=x_slice)
        component_pars[prefix + 'amplitude'].set(min=0, max=10)
        component_pars[prefix + 'center'].set(min=lo, max=hi)
        component_pars[prefix + 'gamma'].set(min=-8, max=8)
        component_pars[prefix + 'sigma'].set(min=0.1, max=10)
        return component, component_pars

    left_mod, pars = _skewed_component('left_', breakpoints[0], breakpoints[1],
                                       x[:ix[1]], y[:ix[1]])
    center_mod = lmfit.models.GaussianModel(prefix='center_')
    pars.update(center_mod.guess(y[ix[2]:ix[3]], x=x[ix[2]:ix[3]]))
    pars['center_amplitude'].set(min=0, max=10)
    pars['center_center'].set(min=breakpoints[2], max=breakpoints[3])
    pars['center_sigma'].set(min=0.1, max=10)
    right_mod, right_pars = _skewed_component('right_', breakpoints[4], breakpoints[5],
                                              x[ix[4]:], y[ix[4]:])
    pars.update(right_pars)
    mod = left_mod + center_mod + right_mod
    init = mod.eval(pars, x=x)
    out = mod.fit(y, pars, x=x, method='least_squares')
    return (init, out)
def fit_site(data, site, show_report=False, show_plot=False):
    """Fit the three-component mixture to one subdomain.

    Optionally prints LMFit's report and/or shows the diagnostic plots.
    Returns the fitted lmfit ModelResult.
    """
    hist_x, hist_y = site_to_function_hist(data, site)
    init, result = fit_skewed_normal_mixture(hist_x, hist_y)
    if show_report:
        report(site, result)
    if show_plot:
        plot(site, hist_x, hist_y, init, result)
    return result
I want to explain some of the choices in the function above.
First, the histogram stuff. LMFit is meant to fit functions. I don't know how to turn a single vector into an x -> y function to fit a univariate distribution, so I just made a histogram. Numpy has stuff built in for picking the number of bins, so I let it do that. But, sometimes it would pick very few bins (~10), and that could cause the fits to go crazy, so I set a minimum of 30. Maybe a KDE is a better choice now that I think about it? Or maybe there's something else?
The breakpoints: What I was observing when fitting at first is that the various components would slip all around and eat bits of each other. Specifically, the "center" would often have its tails in the left and right, which would cause underestimates for those components. Sometimes, the left or right would jump to the other side. To tame this behavior, I'm setting non-overlapping bounds (with a margin between them) for the components as well as initializing the component params with guesses from only the data within those bounds. That cleaned up a lot of the undesirable behavior, but I don't know if it's too prescriptive.
Same with the other bounds. Some of the optimization methods I was experimenting with required finite bounds, so I set what I think are reasonable bounds on everything, but they may be too strong.
# Fit the mixture for every subdomain with at least 50 URLs.
model_outputs = {}
for site in tqdm.tqdm_notebook(df.groupby('subdomain').count().query('domain >= 50').index.values, smoothing=0):
    model_outputs[site] = fit_site(df, site)
Here's an example of what the model output looks like. The text is the report that LMFit provides. The first graph shows three things: the raw histogram data in purple, the initialized model before fitting in dotted gray, and the fit model in orange. The second graph shows four things: the histogram data in purple, the left component in dotted blue, the center component in dotted green, and the right component in dotted red. We can see that for YouTube, it fit a sizable left and right component and a broad center.
fit_site(df, 'youtube.com', show_plot=True, show_report=True)
Let's see what the components look like for a bunch of outlets we're familiar with.
# A hand-picked set of familiar news outlets, used to eyeball the fits.
news_media_domains = [
    'aljazeera.com', 'americanthinker.com', 'bbc.com',
    'bbc.co.uk', 'bloomberg.com', 'bostonglobe.com', 'breitbart.com',
    'buzzfeed.com', 'cbc.ca', 'cbsnews.com', 'chicagotribune.com', 'cnbc.com',
    'cnn.com', 'csmonitor.com', 'dailycaller.com', 'dailykos.com',
    'dailymail.co.uk', 'economist.com', 'forbes.com', 'foreignpolicy.com',
    'fortune.com', 'foxnews.com', 'haaretz.com', 'hindustantimes.com',
    'huffingtonpost.com', 'huffpost.com', 'independent.co.uk', 'infowars.com',
    'latimes.com', 'miamiherald.com', 'motherjones.com', 'msnbc.com',
    'nationalreview.com', 'nbcnews.com', 'newsweek.com', 'newyorker.com',
    'npr.org', 'nydailynews.com', 'nypost.com', 'nytimes.com', 'pbs.org',
    'politico.com', 'propublica.org', 'realclearpolitics.com','reuters.com',
    'rollcall.com', 'rt.com', 'salon.com', 'news.sky.com', 'slate.com',
    'sputniknews.com', 'theatlantic.com', 'theguardian.com', 'thehill.com',
    'time.com', 'usatoday.com', 'vox.com', 'washingtonpost.com',
    'washingtontimes.com', 'weeklystandard.com', 'westernjournal.com', 'wsj.com',
    'zerohedge.com',
]
for site in tqdm.tqdm_notebook(news_media_domains):
    fit_site(df, site, show_plot=True)
When looking through some of the graphs and fits, I started collecting subdomains that had distributions that caused them to fit poorly. Here are two graphs for all of these subdomains. It's useful to look through this set for craziness. Some still look wonky here, but most fits seem reasonable.
# Subdomains collected while browsing the fits above whose distributions made
# them fit poorly; plotted together to look for systematic problems.
outliers = ['swingleft.org', 'nationalpost.com', 'rt.com', 'news.sky.com', 'thebiafratelegraph.co', 'wsbtv.com',
            'english.alarabiya.net', 'dispatch.com', 'commun.it', 'breaking911.com', 'wjla.com',
            'elpais.com', 'ktla.com', 'amnesty.org', 'lapatilla.com', 'deadspin.com', 'teaparty.org']
for site in tqdm.tqdm_notebook(outliers):
    fit_site(df, site, show_plot=True)
Let's get a feel for what the estimated model params look like. One thing I noticed when poking through these is that when one component was estimated to have a contribution close to zero, the parameters for that component were all over the place. That makes sense, but it also makes looking at things in aggregate much noisier. To actually drop out components that have very small contributions, I'm filtering based on the "height" param, which is not really a param but a calculated value based on sigma and amplitude that's easier to interpret.
# Components with a fitted "height" below this are treated as absent.
HEIGHT_CUTOFF = 0.01

def filter_params(params):
    """Flatten one model's parameters into a dict, NaN-ing negligible components.

    When a component's fitted 'height' (a value derived from amplitude and
    sigma) is below HEIGHT_CUTOFF, its other parameters are unconstrained
    noise, so they are reported as NaN. The center component is a plain
    Gaussian and has no 'gamma' parameter, so that key is skipped.
    """
    vector = {}
    params_for_vector = ['amplitude', 'center', 'gamma', 'sigma']
    for prefix in ['left_', 'center_', 'right_']:
        # One check per component, shared by all of its parameters.
        negligible = params[prefix + 'height'] < HEIGHT_CUTOFF
        for param in params_for_vector:
            name = prefix + param
            if name == 'center_gamma':
                continue
            # np.nan: np.NaN was removed in NumPy 2.0.
            vector[name] = np.nan if negligible else params[name].value
    return vector
# One row per subdomain, one column per parameter (NaN where a component was dropped).
model_params = pd.DataFrame({s: filter_params(model_outputs[s].params) for s in model_outputs}).T
model_params.head()
model_params.describe()
Observations
Let's graph this stuff.
# Component means across subdomains.
_ = model_params.loc[:,['left_center', 'center_center', 'right_center']].plot.hist(bins=50)
Observations
# Component amplitudes.
_ = model_params.loc[:,['left_amplitude', 'center_amplitude', 'right_amplitude']].plot.hist(bins=50, subplots=True, figsize=(5, 8))
Observations
# Component widths (log y to show the long tail).
_ = model_params.loc[:,['left_sigma', 'center_sigma', 'right_sigma']].plot.hist(bins=50, subplots=True, figsize=(5, 8), logy=True)
Observations
# Skew of the left and right components.
_ = model_params.loc[:,['left_gamma', 'right_gamma']].plot.hist(bins=50, alpha=0.5)
# tov1.net is excluded — presumably an extreme outlier that squashes the
# pairplot axes; verify against the describe() output above.
_ = sns.pairplot(model_params.drop('tov1.net'))
Let's see if I can state what we've generated here. For a given subdomain, we have the mean ideology of politically engaged Twitter sharers of each URL. We now have 11 (or fewer) parameters for each subdomain that summarize the distribution of those URL means. By measuring the difference between the parameter vectors for two different subdomains, we can see how similar those subdomains are in terms of ideological sharing.
To do all those comparisons at once and visualize it, we can do dimensionality reduction on the 11 dimensional parameter space. That's what I'm doing here. I'm making two graphs. One where, to find its position in the graph, each subdomain only looks at its 15 closest neighbors, and one where each subdomain looks at every other subdomain. As the documentation for UMAP puts it:
This means that low values of n_neighbors will force UMAP to concentrate on very local structure (potentially to the detriment of the big picture), while large values will push UMAP to look at larger neighborhoods of each point when estimating the manifold structure of the data, losing fine detail structure for the sake of getting the broader [structure] of the data.
So I'm doing both.
Similar to above, parameters for components that have very little contribution are getting set to default values (because setting them to NaN upsets dimensionality reduction).
# Fallback values used when a component's contribution is negligible: zero
# height, and the across-subdomain mean position for its center. Keeps the
# vectors NaN-free so dimensionality reduction can run.
DEFAULTS = {
    'left_height': 0.0,
    'center_height': 0.0,
    'right_height': 0.0,
    'left_center': model_params['left_center'].mean(),
    'center_center': model_params['center_center'].mean(),
    'right_center': model_params['right_center'].mean(),
}
def model_result_to_param_vector(result, height_cutoff):
    """Reduce one fit result to a {height, center} vector per component.

    Components whose fitted height falls below *height_cutoff* get DEFAULTS
    values instead: their parameters are unconstrained noise, and NaNs would
    upset downstream dimensionality reduction.
    """
    params = result.params
    vector = {}
    for prefix in ['left_', 'center_', 'right_']:
        # Hoisted out of the inner loop: the check depends only on the prefix.
        negligible = params[prefix + 'height'] < height_cutoff
        for param in ['height', 'center']:
            name = prefix + param
            vector[name] = DEFAULTS[name] if negligible else params[name].value
    return vector
def model_results_to_subdomain_vectors(results, height_cutoff=HEIGHT_CUTOFF):
    """Turn per-site fit results into per-subdomain parameter vectors.

    Returns (raw_vectors, z_scored_vectors), both DataFrames indexed by subdomain.

    Bug fix: this previously iterated the global `model_outputs` instead of
    the `results` argument, silently ignoring the parameter.
    """
    subdomain_vectors = pd.DataFrame({site: model_result_to_param_vector(output, height_cutoff)
                                      for site, output in results.items()}).T
    # Column-wise z-scoring so no single parameter dominates distances.
    scaled = (subdomain_vectors - subdomain_vectors.mean()) / subdomain_vectors.std()
    return subdomain_vectors, scaled
subdomain_vectors, normed_vectors = model_results_to_subdomain_vectors(model_outputs)
# Per-subdomain summary stats, used for coloring and sizing the embedding plots.
subdomain_info = pd.DataFrame({
    'mean_url_ideo': df.groupby('subdomain')['ideo_est'].mean(),
    'num_shares': df.groupby('subdomain')['num_sharers'].sum(),
    'num_urls': df.groupby('subdomain')['domain'].count()
}, index=subdomain_vectors.index)
def measure_left_right_alignment(coords):
    """Spearman correlation between one embedding axis and mean sharer ideology.

    NOTE(review): the comment below says "x values", but the code sorts by
    column 1 of `coords`. The choice of column is arbitrary up to the rotation
    found by rotate_to_optimal_alignment — confirm the intended axis.
    """
    # Spearman correlation between x values and ideology
    ideo_in_x_order = pd.DataFrame(coords, index=subdomain_info.index).join(subdomain_info).sort_values(1)['mean_url_ideo']
    idx_in_x_order = pd.Series(range(ideo_in_x_order.shape[0]), index=ideo_in_x_order.index)
    return ideo_in_x_order.rank().corr(idx_in_x_order)
def rotate(df, theta):
    """Rotate 2 x N coordinates counter-clockwise by *theta* radians."""
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    rotation = np.array([[cos_t, -sin_t],
                         [sin_t, cos_t]])
    return rotation.dot(df)
def rotation_to_left_right_alignment(theta, coords):
    """Negated alignment after rotating by *theta*, so a minimizer maximizes alignment."""
    rotated = rotate(coords.T, theta).T
    return -measure_left_right_alignment(rotated)
def rotate_to_optimal_alignment(coords):
    """Rotate a 2-D embedding so one axis best aligns with left-right ideology,
    then z-score each axis, swap the axes, and reflect (see reflect_if_necessary).
    """
    # I think there's some way of doing this analytically, but it's fast and easy to just optimize it.
    # Bug fixes: `args=(coords)` is not a tuple — it's just `coords` — and
    # minimize_scalar only honors `bounds` when method='bounded'.
    optimal_rotation = scipy.optimize.minimize_scalar(
        rotation_to_left_right_alignment,
        bounds=(0, np.pi * 2),
        args=(coords,),
        method='bounded'
    ).x
    rotated = pd.DataFrame(rotate(coords.T, optimal_rotation).T)
    normed_and_swapped = ((rotated - rotated.mean()) / rotated.std()).loc[:, [1, 0]].values
    return reflect_if_necessary(normed_and_swapped)
def reflect_if_necessary(coords):
    """Reflect the embedding across the x axis (negate every y coordinate).

    Earlier versions tried heuristics for when to reflect (e.g. only when most
    points sat above y=0), but empirically reflecting every time looked best —
    reason unknown — so the dead conditional and commented-out heuristics have
    been removed and this now flips unconditionally.
    """
    return coords * [1, -1]
def plot_subdomain_embedding(coords):
    """Interactive Plotly scatter of an embedding: one marker per subdomain,
    colored by mean sharer ideology and sized by total share volume."""
    rotated_coords = rotate_to_optimal_alignment(coords)
    scatter2 = go.Scattergl(
        x=rotated_coords[:, 0],
        y=rotated_coords[:, 1],
        mode='markers',
        # Hover shows only the subdomain name.
        text=subdomain_info.index,
        hoverinfo='text',
        marker=dict(
            color=subdomain_info['mean_url_ideo'],
            # Pull the color scale slightly inside the extreme component
            # means so outliers don't wash out the palette.
            cmin=model_params['left_center'].min() + 0.3,
            cmid=model_params['center_center'].mean(),
            cmax=model_params['right_center'].max() - 0.3,
            # Marker area proportional to total shares (in thousands),
            # floored at sizemin.
            size=subdomain_info['num_shares'] / 1000,
            sizemode='area',
            sizemin=2.2
        )
    )
    layout2 = go.Layout(
        title ='Domain Similarity by Distribution of Audience Ideology',
        hovermode = 'closest',
        xaxis = dict(visible = False),
        yaxis = dict(visible = False),
        width = 800,
        height = 800
    )
    plotly.iplot(go.Figure(data=[scatter2], layout=layout2))
# Embed the normalized parameter vectors with UMAP (15 nearest neighbors).
subdomain_vectors_embedded_15nn = umap.UMAP(random_state=1, n_neighbors=15).fit_transform(normed_vectors)
# Debug peek at the mean coordinates of points in the upper half-plane.
# (Bug fix: this line previously ran *before* the variable was defined.)
subdomain_vectors_embedded_15nn[subdomain_vectors_embedded_15nn[:,1] > 0].mean()
plot_subdomain_embedding(subdomain_vectors_embedded_15nn)
Observations
# Same embedding with a broader neighborhood (50 nearest neighbors).
subdomain_vectors_embedded_50nn = umap.UMAP(random_state=1, n_neighbors=50).fit_transform(normed_vectors)
plot_subdomain_embedding(subdomain_vectors_embedded_50nn)
Now we're looking at 50 neighbors instead of 15.
Observations
# Maximal neighborhood: every subdomain considers all the others.
subdomain_vectors_embedded_allnn = umap.UMAP(random_state=1, n_neighbors=subdomain_vectors.shape[0]-1).fit_transform(normed_vectors)
plot_subdomain_embedding(subdomain_vectors_embedded_allnn)
Now we're looking at all neighbors.
Observations
To get a feel for what some of these features might represent in the parameter (and URL ideo space), I'm going to pull two samples from each topographic feature.
Large sites on the far right vs large right sites closer to center:
# Pull fits and parameter vectors for these example sites.
sites = ['foxnews.com', 'thefederalist.com',
         'breitbart.com', 'truepundit.com']
for site in sites:
    fit_site(df, site, show_plot=True)
subdomain_vectors.loc[sites,:]
Observations
That island in the center vs sites just off the island:
# Pull fits and parameter vectors for these example sites.
sites = ['thebiafratelegraph.co', 'lapatilla.com',
         'patreon.com', 'cato.org']
for site in sites:
    fit_site(df, site, show_plot=True)
subdomain_vectors.loc[sites,:]
Observations
Far end of the blue spike vs. near side of the blue spike:
# Pull fits and parameter vectors for these example sites.
sites = ['thinkprogress.org', 'actblue.com',
         'rawstory.com', 'newyorker.com']
for site in sites:
    fit_site(df, site, show_plot=True)
subdomain_vectors.loc[sites,:]
Observations
There are a number of parameters to the dimensionality reduction algorithm, and something that would be nice is seeing how the graph shifts as those parameters change. One nice way of seeing that is by animating the graph. Animating the graph is somewhat useful in this context, but will be very useful when we're looking at things longitudinally, so I feel OK spending some time on it.
The thing that's difficult about this is finding the balance between making the graph stable from frame to frame and making the distances between nodes as meaningful as possible. I've addressed this in four ways:
Some of this is taken from here.
It just renders the last frame here. The actual animations are here and here.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcol
from matplotlib import animation
%matplotlib inline
import seaborn as sns
import itertools
# Seaborn styling for the animation frames below.
sns.set(style='white', rc={'figure.figsize':(14, 12)})
def tween(e1, e2, n_frames=60):
for i in range(20):
yield e1
for i in range(n_frames):
alpha = i / float(n_frames - 1)
yield (1 - alpha) * e1 + alpha * e2
for i in range(20):
yield(e2)
return
def generate_frame_coords(data, height_cutoff=HEIGHT_CUTOFF, **kwargs):
    """One animation keyframe: embed the scaled subdomain vectors with UMAP and align the result."""
    _, scaled_vectors = model_results_to_subdomain_vectors(data, height_cutoff)
    embedded = umap.UMAP(random_state=1, **kwargs).fit_transform(scaled_vectors)
    return rotate_to_optimal_alignment(embedded)
def generate_frame_data(data, arg_name='n_neighbors', arg_list=(3,4,5,7,9,11,15,20,25,30,40,60,100), **kwargs):
    """Compute a keyframe embedding per value in arg_list and tween between them.

    Each UMAP run after the first is seeded with the previous embedding
    (init=...) so consecutive keyframes stay visually stable.

    Note: arg_list's default is now a tuple rather than a list, avoiding the
    mutable-default-argument pitfall; callers passing lists are unaffected.
    """
    keyframes = []
    for arg in arg_list:
        kwargs[arg_name] = arg
        if keyframes:
            keyframes.append(generate_frame_coords(data, init=keyframes[-1], **kwargs))
        else:
            keyframes.append(generate_frame_coords(data, **kwargs))
    frames = []
    for start, end in zip(keyframes[:-1], keyframes[1:]):
        frames.extend(tween(start, end))
    return frames
def create_animation(frame_data, arg_name='n_neighbors', arg_list=[3,4,5,7,9,11,15,20,25,30,40,60,100]):
    """Render frame_data as a scatter animation, saved to umap_anim-<arg_name>.mp4.

    NOTE(review): the title-update arithmetic below assumes each keyframe
    transition from tween() is exactly 100 frames (20 hold + 60 tween +
    20 hold) — confirm if tween's defaults change.
    """
    fig = plt.figure()
    all_data = np.vstack(frame_data)
    #pol_cm = mcol.LinearSegmentedColormap.from_list("Pol",["#3771f3", "#b147cc", "#d62222"])
    # Blue -> gray -> red colormap for mean sharer ideology.
    pol_cm = mcol.LinearSegmentedColormap.from_list("Pol",["#3771f3", "#cccccc", "#d62222"])
    # Fixed axis bounds with a 10% margin so points never leave the frame.
    frame_bounds = (all_data[:, 0].min() * 1.1,
                    all_data[:, 0].max() * 1.1,
                    all_data[:, 1].min() * 1.1,
                    all_data[:, 1].max() * 1.1)
    ax = plt.axes(xlim=(frame_bounds[0], frame_bounds[1]),
                  ylim=(frame_bounds[2], frame_bounds[3]))
    points = plt.scatter(frame_data[0][:, 0], frame_data[0][:, 1],
                         #s=5,
                         # Marker area scales with share volume, floored at 10.
                         s=[max(10, s) for s in (subdomain_info['num_shares'] / 1000)],
                         c=subdomain_info['mean_url_ideo'],
                         cmap=pol_cm,
                         alpha=0.8,
                         edgecolors='white',
                         animated=True)
    title = plt.title('', fontsize=24)
    ax.set_xticks([])
    ax.set_yticks([])
    #cbar = plt.colorbar(values=np.arange(10), boundaries=np.arange(11)-0.5, ticks=np.arange(10), drawedges=True)
    #cbar.ax.yaxis.set_ticklabels(np.arange(10), fontsize=18)
    def init():
        # First frame: initial positions and the first parameter value in the title.
        points.set_offsets(frame_data[0])
        title.set_text('UMAP with {}={}'.format(arg_name, arg_list[0]))
        return points,
    def animate(i):
        points.set_offsets(frame_data[i])
        # Swap the title during the hold phase of each new keyframe.
        if (i + 15) % 100 == 0:
            title.set_text('UMAP with {}={}'.format(arg_name, arg_list[(i + 15) // 100]))
        return points,
    anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(frame_data), interval=20, blit=True)
    anim.save('umap_anim-{}.mp4'.format(arg_name), writer='ffmpeg', fps=30)
def animate_param(data, arg_name='n_neighbors', arg_list=(3,4,5,7,9,11,15,20,25,30,40,60,100), **kwargs):
    """Sweep one parameter over arg_list and render the resulting animation to mp4.

    arg_list defaults to a tuple (was a list) to avoid the mutable-default
    pitfall; behavior is otherwise unchanged.
    """
    frame_data = generate_frame_data(data, arg_name, arg_list, **kwargs)
    create_animation(frame_data, arg_name, arg_list)
%%time
# Sweep the UMAP neighborhood size from very local (3) to global (954 = all subdomains).
#animate_param(model_outputs, 'n_neighbors', [3,15,100,954])
animate_param(model_outputs, 'n_neighbors', [3,4,5,7,9,11,15,30,60,100,200,300,400,500,600,700,800,954])
%%time
# Sweep the component height cutoff at a fixed (global) neighborhood size.
#animate_param(model_outputs, 'height_cutoff', [0, 0.1])
animate_param(model_outputs, 'height_cutoff', [0, 0.0001, 0.001, 0.005, 0.007, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 0.5], n_neighbors=954)
reuters.com