Gaussian Mixture Experimentation

Updated Apr 22, 2019

Playing with fitting mixture models to URL sharing. Note that there are two distributions we could be fitting: either the ideological distribution of every Twitter account who shared the domain, or, the mean ideology of sharers of individual URLs within a domain, which we're doing here.

Table of Contents

In [1]:
%matplotlib inline

import sklearn.mixture, tqdm, lmfit.models, umap, scipy.optimize

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.offline as plotly
import plotly.graph_objs as go
In [2]:
df = pd.read_csv('../data/all_samples_combined/url_ideo_est_sharer_mean.csv', index_col=0)
In [3]:
df.sample(5)
Out[3]:
ideo_est stddev_sharer_ideo num_sharers subdomain domain
https://miami.cbslocal.com/2018/06/05/parent-parkland-victim-agrees-devos/ 1.1690 0.6484 4 miami.cbslocal.com cbslocal.com
https://news.grabien.com/story-left-wing-twitter-celebrates-shooting-rep-scalise 1.4580 0.5186 31 news.grabien.com grabien.com
https://www.westernjournal.com/trump-makes-good-on-promise-to-add-citizenship-question-to-census/ 1.8060 0.5044 3 westernjournal.com westernjournal.com
http://www.huffingtonpost.com/entry/kathy-griffin-jeffrey-mezger-kb-home_us_59c145aee4b0186c22061a67 -1.0490 0.3320 61 huffingtonpost.com huffingtonpost.com
https://www.nbcnews.com/news/us-news/michigan-judge-gives-convicted-rapist-parental-rights-victim-s-son-n809196 -0.7754 0.2780 6 nbcnews.com nbcnews.com

Gaussian Mixtures

In [4]:
def fit_site_mixture(df, site, n_components=2, plot=False):
    """Fit a Bayesian Gaussian mixture to one site's URL-level ideology means.

    Parameters
    ----------
    df : DataFrame with at least 'subdomain' and 'ideo_est' columns.
    site : subdomain name to select rows for.
    n_components : maximum number of mixture components; the small
        Dirichlet concentration prior below prunes unused ones.
    plot : if True, draw the site's histogram against the overall
        distribution with dotted lines at fitted means, plus a
        mean-vs-weight scatter of the components.

    Returns
    -------
    DataFrame with one row per component ('mean', 'weight', 'covariance'),
    sorted ascending by mean.
    """
    site_mixture = sklearn.mixture.BayesianGaussianMixture(
        n_components=n_components,
        covariance_type='spherical',
        max_iter=1000,
        # A small concentration prior biases toward fewer active components.
        weight_concentration_prior=0.001
    )
    site_url_ideos = df[df['subdomain'] == site]['ideo_est'].dropna()
    site_fitted = site_mixture.fit(site_url_ideos.values.reshape(-1, 1))
    if plot:
        # Fixed typo: the figure handle was previously misnamed 'fix'.
        fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
        site_url_ideos.plot.hist(bins=100, alpha=0.5, ax=ax1, density=True)
        df['ideo_est'].plot.hist(bins=100, alpha=0.5, ax=ax1, density=True)
        # Dotted vertical lines mark each fitted component mean.
        ax1.vlines(site_fitted.means_, 0, ax1.get_ylim()[1], linestyles='dotted', alpha=0.5)
        print(site)
    site_fit_params = pd.DataFrame({
        'mean': site_fitted.means_.reshape(1, -1)[0],
        'weight': site_fitted.weights_.reshape(1, -1)[0],
        'covariance': site_fitted.covariances_.reshape(1, -1)[0]
    }).sort_values('mean')
    if plot:
        ax2.set_ylim(0, 1)
        sns.scatterplot(data=site_fit_params, x='mean', y='weight', ax=ax2)
    return site_fit_params
In [5]:
fit_site_mixture(df, 'foxnews.com', plot=True)
foxnews.com
Out[5]:
mean weight covariance
1 0.716782 0.169435 0.406151
0 1.412040 0.830565 0.039177
In [6]:
fit_site_mixture(df, 'infowars.com', plot=True)
infowars.com
Out[6]:
mean weight covariance
0 0.973249 0.03541 0.376458
1 1.533751 0.96459 0.033451
In [7]:
fit_site_mixture(df, 'youtube.com', plot=True)
youtube.com
Out[7]:
mean weight covariance
0 -0.623965 0.414294 0.282220
1 1.397932 0.585706 0.088941
In [8]:
fit_site_mixture(df, 'youtube.com', n_components=3, plot=True)
youtube.com
Out[8]:
mean weight covariance
0 -0.980238 0.235818 0.030550
1 0.167216 0.250279 0.446781
2 1.458559 0.513904 0.056451

Using Skewed Gaussians with LMFit

Many of the distributions don't look like straight Gaussians - they look like skewed Gaussians. To fit a mixture of skewed Gaussians, we're going to use LMFit.

In [9]:
_ = df['ideo_est'].plot.hist(bins=100, alpha=0.5, density=True)
In [10]:
df['ideo_est'].describe()
Out[10]:
count    843723.000000
mean         -0.028966
std           1.151868
min          -1.587000
25%          -1.027000
50%          -0.712400
75%           1.368000
max           2.521000
Name: ideo_est, dtype: float64
In [11]:
# Shared constants for the mixture fits below.
RANGE = (-2.0, 2.6)  # x-range for all histograms; pads the observed ideo_est min/max (-1.587, 2.521)
CENTER = 0.4         # rough prior locations for the center/left/right components
LEFT_CENTER = -1
RIGHT_CENTER = 1.6
In [12]:
def index_of(arrval, value):
    """Return the index of the last array element at or below *value* (0 if none)."""
    at_or_below = np.flatnonzero(np.asarray(arrval) <= value)
    if at_or_below.size == 0:
        return 0
    return at_or_below.max()

def site_to_function_hist(df, site):
    """Histogram one site's URL ideology means as (bin_centers, densities).

    Bin count is chosen automatically by numpy, with a floor of 30 bins
    (too-few bins destabilized the downstream fits).
    """
    ideos = df.loc[df['subdomain'] == site, 'ideo_est'].dropna()
    densities, edges = np.histogram(ideos, bins='auto', density=True, range=RANGE)
    if len(edges) <= 30:
        densities, edges = np.histogram(ideos, bins=30, density=True, range=RANGE)
    # Midpoint of each pair of consecutive edges.
    centers = (edges[:-1] + edges[1:]) / 2
    return (centers, densities)
        
def report(site, result):
    """Print *site* followed by the LMFit fit report for *result*."""
    fit_summary = result.fit_report(min_correl=0.5)
    print(site)
    print(fit_summary)
        
def plot(site, x, y, init, result):
    """Two-panel plot: data + overall fit on top, individual components below.

    x, y: histogram bin centers / densities from site_to_function_hist.
    init: the model evaluated at its initial parameter guesses.
    result: the fitted lmfit ModelResult.
    """
    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(4, 6))

    # np.histogram is used only for its bin-edge computation: this yields
    # 201 evenly spaced points over RANGE for smoothly sampling the curves.
    _, sample_at = np.histogram([1], bins=200, density=True, range=RANGE)

    ax1.set_title(site)
    ax1.bar(x, y, color='purple', alpha=0.2, width=(RANGE[1] - RANGE[0]) / len(x))
    ax1.plot(x, init, 'k--', alpha=0.1)  # initial guess, faint dashed
    ax1.plot(sample_at, result.eval(x=sample_at), 'orange', alpha=0.6)  # fitted sum

    # Bottom panel: each mixture component evaluated separately.
    comps = result.eval_components(x=sample_at)
    ax2.plot(x, y, color='purple', alpha=0.3)
    ax2.plot(sample_at, comps['left_'], 'b--', alpha=0.4)
    ax2.plot(sample_at, comps['center_'], 'g--', alpha=0.4)
    ax2.plot(sample_at, comps['right_'], 'r--', alpha=0.4)
        
def fit_skewed_normal_mixture(x, y):
    """Fit a left (skewed) + center (Gaussian) + right (skewed) mixture to (x, y).

    Each component's parameters are guessed only from the data inside its
    own breakpoint window, and its center is bounded to that window, to keep
    components from drifting into each other's territory. Returns
    (initial_curve, lmfit_result).
    """
    breakpoints = [RANGE[0], -0.4, 0.0, 0.8, 1.2, RANGE[1]]
    ix = [index_of(x, b) for b in breakpoints]

    # (prefix, model class, data window, center bounds, has a gamma param)
    component_specs = [
        ('left_', lmfit.models.SkewedGaussianModel, slice(None, ix[1]),
         (breakpoints[0], breakpoints[1]), True),
        ('center_', lmfit.models.GaussianModel, slice(ix[2], ix[3]),
         (breakpoints[2], breakpoints[3]), False),
        ('right_', lmfit.models.SkewedGaussianModel, slice(ix[4], None),
         (breakpoints[4], breakpoints[5]), True),
    ]

    components = []
    pars = None
    for prefix, model_cls, window, (lo, hi), has_gamma in component_specs:
        component = model_cls(prefix=prefix)
        guessed = component.guess(y[window], x=x[window])
        if pars is None:
            pars = guessed
        else:
            pars.update(guessed)
        pars[prefix + 'amplitude'].set(min=0, max=10)
        pars[prefix + 'center'].set(min=lo, max=hi)
        if has_gamma:
            pars[prefix + 'gamma'].set(min=-8, max=8)
        pars[prefix + 'sigma'].set(min=0.1, max=10)
        components.append(component)

    mod = components[0] + components[1] + components[2]

    init = mod.eval(pars, x=x)
    out = mod.fit(y, pars, x=x, method='least_squares')

    return (init, out)

def fit_site(data, site, show_report=False, show_plot=False):
    """Fit the three-component mixture to one site; optionally report and plot."""
    bin_centers, densities = site_to_function_hist(data, site)
    init, result = fit_skewed_normal_mixture(bin_centers, densities)
    if show_report:
        report(site, result)
    if show_plot:
        plot(site, bin_centers, densities, init, result)
    return result

I want to explain some of the choices in the function above.

First, the histogram stuff. LMFit is meant to fit functions. I don't know how to turn a single vector into an x -> y function to fit a univariate distribution, so I just made a histogram. Numpy has stuff built in for picking the number of bins, so I let it do that. But, sometimes it would pick very few bins (~10), and that could cause the fits to go crazy, so I set a minimum of 30. Maybe a KDE is a better choice now that I think about it? Or maybe there's something else?

The breakpoints: What I was observing when fitting at first is that the various components would slip all around and eat bits of each other. Specifically, the "center" would often have its tails in the left and right, which would cause underestimates for those components. Sometimes, the left or right would jump to the other side. To tame this behavior, I'm setting non-overlapping bounds (with a margin between them) for the components as well as initializing the component params with guesses from only the data within those bounds. That cleaned up a lot of the undesirable behavior, but I don't know if it's too prescriptive.

Same with the other bounds. Some of the optimization methods I was experimenting with required finite bounds, so I set what I think are reasonable bounds on everything, but they may be too strong.

In [13]:
# Fit the mixture for every subdomain with at least 50 URLs in the sample.
model_outputs = {}
for site in tqdm.tqdm_notebook(df.groupby('subdomain').count().query('domain >= 50').index.values, smoothing=0):
    model_outputs[site] = fit_site(df, site)
/berkman/home/jclark/miniconda3/lib/python3.6/site-packages/lmfit/minimizer.py:767: RuntimeWarning:

invalid value encountered in sqrt

/berkman/home/jclark/miniconda3/lib/python3.6/site-packages/lmfit/minimizer.py:774: RuntimeWarning:

invalid value encountered in sqrt

/berkman/home/jclark/miniconda3/lib/python3.6/site-packages/lmfit/minimizer.py:774: RuntimeWarning:

divide by zero encountered in double_scalars

/berkman/home/jclark/miniconda3/lib/python3.6/site-packages/lmfit/minimizer.py:774: RuntimeWarning:

invalid value encountered in double_scalars


Example of Model Output

Here's an example of what the model output looks like. The text is the report that LMFit provides. The first graph shows three things: the raw histogram data in purple, the initialized model before fitting in dotted gray, and the fit model in orange. The second graph shows four things: the histogram data in purple, the left component in dotted blue, the center component in dotted green, and the right component in dotted red. We can see that for YouTube, it fit a sizable left and right component and a broad center.

In [14]:
fit_site(df, 'youtube.com', show_plot=True, show_report=True)
youtube.com
[[Model]]
    ((Model(skewed_gaussian, prefix='left_') + Model(gaussian, prefix='center_')) + Model(skewed_gaussian, prefix='right_'))
[[Fit Statistics]]
    # fitting method   = least_squares
    # function evals   = 21
    # data points      = 32
    # variables        = 11
    chi-square         = 0.00936065
    reduced chi-square = 4.4575e-04
    Akaike info crit   = -238.383260
    Bayesian info crit = -222.260165
[[Variables]]
    left_gamma:        8.00000000 +/- 63.3314011 (791.64%) (init = 0)
    left_sigma:        0.33029148 +/- 0.03415783 (10.34%) (init = 0.14375)
    left_center:      -1.20357536 +/- 0.04453502 (3.70%) (init = -0.921875)
    left_amplitude:    0.24053106 +/- 0.02935159 (12.20%) (init = 0.2525256)
    left_height:       0.29052525 +/- 0.01297690 (4.47%) == '0.3989423*left_amplitude/max(1.e-15, left_sigma)'
    left_fwhm:         0.77777698 +/- 0.08043553 (10.34%) == '2.3548200*left_sigma'
    center_sigma:      0.93457802 +/- 0.10744352 (11.50%) (init = 0.1)
    center_center:     0.32367464 +/- 0.14286354 (44.14%) (init = 0.515625)
    center_amplitude:  0.29468442 +/- 0.03395982 (11.52%) (init = 0.007723213)
    center_fwhm:       2.20076302 +/- 0.25301016 (11.50%) == '2.3548200*center_sigma'
    center_height:     0.12579162 +/- 0.00879949 (7.00%) == '0.3989423*center_amplitude/max(1.e-15, center_sigma)'
    right_gamma:      -2.14170739 +/- 0.27918074 (13.04%) (init = 0)
    right_sigma:       0.33624729 +/- 0.01675407 (4.98%) (init = 0.215625)
    right_center:      1.69075209 +/- 0.01098904 (0.65%) (init = 1.45)
    right_amplitude:   0.47546475 +/- 0.01973150 (4.15%) (init = 0.6143465)
    right_height:      0.56411756 +/- 0.02342599 (4.15%) == '0.3989423*right_amplitude/max(1.e-15, right_sigma)'
    right_fwhm:        0.79180185 +/- 0.03945282 (4.98%) == '2.3548200*right_sigma'
[[Correlations]] (unreported correlations are < 0.500)
    C(left_gamma, left_center)           = -0.996
    C(left_sigma, left_amplitude)        =  0.935
    C(left_center, left_amplitude)       = -0.891
    C(left_gamma, left_amplitude)        =  0.881
    C(right_sigma, right_center)         =  0.879
    C(left_sigma, left_center)           = -0.874
    C(left_gamma, left_sigma)            =  0.857
    C(center_sigma, center_amplitude)    =  0.815
    C(right_gamma, right_center)         = -0.806
    C(right_gamma, right_sigma)          = -0.778
    C(center_center, right_amplitude)    = -0.735
    C(center_amplitude, right_amplitude) = -0.647
    C(right_sigma, right_amplitude)      =  0.600
    C(center_sigma, right_amplitude)     = -0.502

Out[14]:
<lmfit.model.ModelResult at 0x7fa0714537f0>

Common News Media Domains

Let's see what the components look like for a bunch of outlets we're familiar with.

In [15]:
news_media_domains = [
    'aljazeera.com', 'americanthinker.com', 'bbc.com',
    'bbc.co.uk', 'bloomberg.com', 'bostonglobe.com', 'breitbart.com',
    'buzzfeed.com', 'cbc.ca', 'cbsnews.com', 'chicagotribune.com', 'cnbc.com',
    'cnn.com', 'csmonitor.com', 'dailycaller.com', 'dailykos.com',
    'dailymail.co.uk', 'economist.com', 'forbes.com', 'foreignpolicy.com',
    'fortune.com', 'foxnews.com', 'haaretz.com', 'hindustantimes.com',
    'huffingtonpost.com', 'huffpost.com', 'independent.co.uk', 'infowars.com',
    'latimes.com', 'miamiherald.com', 'motherjones.com', 'msnbc.com',
    'nationalreview.com', 'nbcnews.com', 'newsweek.com', 'newyorker.com',
    'npr.org', 'nydailynews.com', 'nypost.com', 'nytimes.com', 'pbs.org',
    'politico.com', 'propublica.org', 'realclearpolitics.com','reuters.com',
    'rollcall.com', 'rt.com', 'salon.com', 'news.sky.com', 'slate.com',
    'sputniknews.com', 'theatlantic.com', 'theguardian.com', 'thehill.com',
    'time.com', 'usatoday.com', 'vox.com', 'washingtonpost.com',
    'washingtontimes.com', 'weeklystandard.com', 'westernjournal.com', 'wsj.com',
    'zerohedge.com',
]
for site in tqdm.tqdm_notebook(news_media_domains):
    fit_site(df, site, show_plot=True)
/berkman/home/jclark/miniconda3/lib/python3.6/site-packages/matplotlib/pyplot.py:514: RuntimeWarning:

More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).


Outliers

When looking through some of the graphs and fits, I started collecting subdomains that had distributions that caused them to fit poorly. Here are two graphs for all of these subdomains. It's useful to look through this set for craziness. Some still look wonky here, but most fits seem reasonable.

In [16]:
outliers = ['swingleft.org', 'nationalpost.com', 'rt.com', 'news.sky.com', 'thebiafratelegraph.co', 'wsbtv.com',
               'english.alarabiya.net', 'dispatch.com', 'commun.it', 'breaking911.com', 'wjla.com',
              'elpais.com', 'ktla.com', 'amnesty.org', 'lapatilla.com', 'deadspin.com', 'teaparty.org']
for site in tqdm.tqdm_notebook(outliers):
    fit_site(df, site, show_plot=True)

Looking at Estimated Model Params

Let's get a feel for what the estimated model params look like. One thing I noticed when poking through these is that when one component was estimated to have a contribution close to zero, the parameters for that component were all over the place. That makes sense, but it also makes looking at things in aggregate much noisier. To actually drop out components that have very small contributions, I'm filtering based on the "height" param, which is not really a param but a calculated value based on sigma and amplitude that's easier to interpret.

In [17]:
HEIGHT_CUTOFF = 0.01  # components with derived 'height' below this are treated as absent
def filter_params(params):
    """Flatten an lmfit Parameters object into a dict of the 11 free params.

    Components whose derived 'height' falls below HEIGHT_CUTOFF get NaN for
    all of their params: near-zero components have essentially unconstrained
    (noisy) estimates that would pollute aggregate views. The center
    component is a plain Gaussian, so it has no gamma.
    """
    vector = {}
    prefixes = ['left_', 'center_', 'right_']
    params_for_vector = ['amplitude', 'center', 'gamma', 'sigma']
    for prefix in prefixes:
        # The cutoff decision depends only on the component, not the param,
        # so hoist it out of the inner loop (previously two duplicated loops).
        drop = params[prefix + 'height'] < HEIGHT_CUTOFF
        for param in params_for_vector:
            name = prefix + param
            if name == 'center_gamma':
                continue
            # np.nan, not the np.NaN alias removed in NumPy 2.0.
            vector[name] = np.nan if drop else params[name].value
    return vector
# Flatten every site's fitted params into a (site x param) table; NaNs mark
# components whose contribution fell below the height cutoff.
model_params = pd.DataFrame({s: filter_params(model_outputs[s].params) for s in model_outputs}).T
model_params.head()
Out[17]:
center_amplitude center_center center_sigma left_amplitude left_center left_gamma left_sigma right_amplitude right_center right_gamma right_sigma
100percentfedup.com 0.066479 8.000000e-01 0.813362 NaN NaN NaN NaN 0.935212 1.479029 0.818352 0.177143
5calls.org NaN NaN NaN 0.972715 -1.188330 1.966054 0.162072 NaN NaN NaN NaN
710wor.iheart.com NaN NaN NaN NaN NaN NaN NaN 0.985807 1.265912 2.757018 0.322619
972mag.com 0.045738 8.510383e-16 0.593867 0.954454 -1.084859 1.943607 0.283890 NaN NaN NaN NaN
a.msn.com 0.149061 1.234982e-24 0.499475 0.823632 -1.272251 4.430473 0.387927 0.029237 1.759049 -1.942950 0.379626
In [18]:
model_params.describe()
Out[18]:
center_amplitude center_center center_sigma left_amplitude left_center left_gamma left_sigma right_amplitude right_center right_gamma right_sigma
count 841.000000 8.410000e+02 841.000000 770.000000 770.000000 770.000000 770.000000 798.000000 798.000000 798.000000 798.000000
mean 0.153523 3.013677e-01 0.547852 0.644527 -1.084550 3.152955 0.351399 0.423531 1.514975 -1.831365 0.569797
std 0.171731 3.345446e-01 0.421705 0.312639 0.201520 3.658481 0.794929 0.376875 0.202193 4.385988 1.395454
min 0.002641 9.629165e-60 0.100000 0.002864 -1.999989 -8.000000 0.100000 0.002909 1.200000 -8.000000 0.100000
25% 0.044713 7.777703e-28 0.145330 0.428141 -1.190103 1.505105 0.168622 0.071647 1.334155 -4.666335 0.187903
50% 0.096993 1.191825e-01 0.456160 0.759042 -1.151210 2.746345 0.225317 0.276023 1.563607 -1.679102 0.256901
75% 0.200246 6.931244e-01 0.871857 0.907670 -1.060156 6.244945 0.330076 0.880625 1.684680 0.112371 0.353079
max 1.032665 8.000000e-01 2.288671 1.434658 -0.400000 8.000000 10.000000 1.176488 2.025229 8.000000 10.000000

Observations

  • The left component drops out more than the right, and they both drop out less than the center.
  • The centers of each component aren't exactly where I initialized them, so that's good.
  • The left component is typically right skewed (positive gamma) and the right component is typically left skewed (negative gamma).

Let's graph this stuff.

In [19]:
_ = model_params.loc[:,['left_center', 'center_center', 'right_center']].plot.hist(bins=50)

Observations

  • All the components want to push against the bounds I set to some degree, but especially the center. That makes sense, though I don't know if it's OK.
  • The left's center has much less spread than the right's.
In [20]:
_ = model_params.loc[:,['left_amplitude', 'center_amplitude', 'right_amplitude']].plot.hist(bins=50, subplots=True, figsize=(5, 8))

Observations

  • There's rarely a strong center.
  • There are more lower amplitude estimates on the right than on the left.
In [21]:
_ = model_params.loc[:,['left_sigma', 'center_sigma', 'right_sigma']].plot.hist(bins=50, subplots=True, figsize=(5, 8), logy=True)

Observations

  • Some components are super wide. I don't know if those are desirable or if they're artefacts or something.
In [22]:
_ = model_params.loc[:,['left_gamma', 'right_gamma']].plot.hist(bins=50, alpha=0.5)
In [23]:
_ = sns.pairplot(model_params.drop('tov1.net'))
/berkman/home/jclark/miniconda3/lib/python3.6/site-packages/numpy/lib/histograms.py:824: RuntimeWarning:

invalid value encountered in greater_equal

/berkman/home/jclark/miniconda3/lib/python3.6/site-packages/numpy/lib/histograms.py:825: RuntimeWarning:

invalid value encountered in less_equal

Graphing Subdomain Similarity though Mixture Params

Let's see if I can state what we've generated here. For a given subdomain, we have the mean ideology of politically engaged Twitter sharers of each URL. We now have 11 (or fewer) parameters for each subdomain that summarize the distribution of those URL means. By measuring the difference between the parameter vectors for two different subdomains, we can see how similar those subdomains are in terms of ideological sharing.

To do all those comparisons at once and visualize it, we can do dimensionality reduction on the 11 dimensional parameter space. That's what I'm doing here. I'm making two graphs: one where, to find its position in the graph, each subdomain looks only at its 15 closest neighbors, and one where each subdomain looks at every other subdomain. As the documentation for UMAP puts it:

This means that low values of n_neighbors will force UMAP to concentrate on very local structure (potentially to the detriment of the big picture), while large values will push UMAP to look at larger neighborhoods of each point when estimating the manifold structure of the data, losing fine detail structure for the sake of getting the broader picture of the data.

So I'm doing both.

Similar to above, parameters for components that have very little contribution are getting set to default values (because setting them to NaN upsets dimensionality reduction).

In [32]:
# Fallback values for components whose contribution is below the height
# cutoff: zero height, and the mean center across all fits. (NaN would
# break the dimensionality reduction downstream.)
DEFAULTS = {
    'left_height': 0.0,
    'center_height': 0.0,
    'right_height': 0.0,
    'left_center': model_params['left_center'].mean(),
    'center_center': model_params['center_center'].mean(),
    'right_center': model_params['right_center'].mean(),
}
def model_result_to_param_vector(result, height_cutoff):
    """Reduce one fit result to the height and center of each component.

    Components whose height falls below *height_cutoff* are replaced with
    the DEFAULTS values so the vectors stay NaN-free.
    """
    params = result.params
    vector = {}
    for prefix in ('left_', 'center_', 'right_'):
        # Negligibility depends only on the component, so test it once.
        negligible = params[prefix + 'height'] < height_cutoff
        for param in ('height', 'center'):
            name = prefix + param
            vector[name] = DEFAULTS[name] if negligible else params[name].value
    return vector

def model_results_to_subdomain_vectors(results, height_cutoff=HEIGHT_CUTOFF):
    """Build per-subdomain parameter vectors from fit results.

    Returns (raw_vectors, z_scored_vectors), both DataFrames indexed by site.

    Bug fix: this previously iterated the global `model_outputs` instead of
    the `results` argument, so any caller passing a different mapping
    silently got the global data.
    """
    subdomain_vectors = pd.DataFrame({
        site: model_result_to_param_vector(output, height_cutoff)
        for site, output in results.items()
    }).T
    # Z-score each column so no single parameter dominates distances.
    scaled = (subdomain_vectors - subdomain_vectors.mean()) / subdomain_vectors.std()
    return subdomain_vectors, scaled
In [69]:
subdomain_vectors, normed_vectors = model_results_to_subdomain_vectors(model_outputs)
# Per-subdomain summary stats used below for marker color/size and for
# measuring left-right alignment of the embeddings.
subdomain_info = pd.DataFrame({
    'mean_url_ideo': df.groupby('subdomain')['ideo_est'].mean(),
    'num_shares': df.groupby('subdomain')['num_sharers'].sum(),
    'num_urls': df.groupby('subdomain')['domain'].count()
}, index=subdomain_vectors.index)
In [101]:
def measure_left_right_alignment(coords):
    # Spearman correlation between x values and ideology
    # coords: (n_sites, 2) embedding, row order matching subdomain_info.index.
    # NOTE(review): this sorts by embedding column 1, not 0 --
    # rotate_to_optimal_alignment later swaps columns via .loc[:, [1, 0]],
    # so column 1 is what ends up as the plotted x-axis; confirm intended.
    # Rank-correlating ideology against position along that axis yields a
    # Spearman-style alignment score in [-1, 1].
    ideo_in_x_order = pd.DataFrame(coords, index=subdomain_info.index).join(subdomain_info).sort_values(1)['mean_url_ideo']
    idx_in_x_order = pd.Series(range(ideo_in_x_order.shape[0]), index=ideo_in_x_order.index)
    return ideo_in_x_order.rank().corr(idx_in_x_order)

def rotate(df, theta):
    """Rotate 2 x N coordinates counterclockwise by *theta* radians."""
    cos_t = np.cos(theta)
    sin_t = np.sin(theta)
    rotation = np.array([[cos_t, -sin_t],
                         [sin_t, cos_t]])
    return rotation.dot(df)

def rotation_to_left_right_alignment(theta, coords):
    """Negated alignment score after rotating coords by theta (minimization objective)."""
    rotated = rotate(coords.T, theta).T
    return -measure_left_right_alignment(rotated)

def rotate_to_optimal_alignment(coords):
    """Rotate a 2-D embedding so mean ideology increases along x, then normalize.

    The optimal rotation angle is found numerically (an analytic solution
    probably exists, but this is fast and easy). The rotated coordinates are
    z-scored, the columns swapped (the alignment score is computed on column
    1), and the result reflected over the x-axis.
    """
    # Fixes: args must be a 1-tuple -- bare (coords) is just coords, which
    # only worked because scipy normalizes non-tuple args. Also, bounds are
    # silently ignored by the default Brent method; method='bounded' makes
    # the (0, 2*pi) bounds actually apply.
    optimal_rotation = scipy.optimize.minimize_scalar(
        rotation_to_left_right_alignment,
        bounds=(0, np.pi * 2),
        args=(coords,),
        method='bounded',
    ).x
    rotated = pd.DataFrame(rotate(coords.T, optimal_rotation).T)
    normed_and_swapped = ((rotated - rotated.mean()) / rotated.std()).loc[:, [1, 0]].values
    return reflect_if_necessary(normed_and_swapped)
    
def reflect_if_necessary(coords):
    """Reflect the embedding over the x-axis.

    Empirically, always reflecting gave the most consistent orientation
    across frames; earlier data-driven heuristics (checking whether the
    embedding was top-heavy) worked worse, for reasons not yet understood.
    """
    return coords * [1, -1]
    
def plot_subdomain_embedding(coords):
    """Interactive plotly scatter of a 2-D subdomain embedding.

    coords: (n_sites, 2) array, row order matching subdomain_info.index.
    The embedding is first rotated/reflected so mean ideology increases
    left to right; marker color encodes mean sharer ideology and marker
    area encodes total shares.
    """
    rotated_coords = rotate_to_optimal_alignment(coords)
    scatter2 = go.Scattergl(
    x=rotated_coords[:, 0],
    y=rotated_coords[:, 1],
    mode='markers',
    text=subdomain_info.index,
    hoverinfo='text',
    marker=dict(
        color=subdomain_info['mean_url_ideo'],
        # Pull the color limits in from the extremes so midtones stay visible.
        cmin=model_params['left_center'].min() + 0.3,
        cmid=model_params['center_center'].mean(),
        cmax=model_params['right_center'].max() - 0.3,
        size=subdomain_info['num_shares'] / 1000,
        sizemode='area',
        sizemin=2.2
        )
    )

    layout2 = go.Layout(
        title ='Domain Similarity by Distribution of Audience Ideology',
        hovermode = 'closest',
        xaxis = dict(visible = False),
        yaxis = dict(visible = False),
        width = 800,
        height = 800
    )
    plotly.iplot(go.Figure(data=[scatter2], layout=layout2))
In [93]:
subdomain_vectors_embedded_15nn[subdomain_vectors_embedded_15nn[:,1] > 0].mean()
Out[93]:
0.48460507
In [70]:
subdomain_vectors_embedded_15nn = umap.UMAP(random_state=1, n_neighbors=15).fit_transform(normed_vectors)
plot_subdomain_embedding(subdomain_vectors_embedded_15nn)

Observations

  • The mean (the color) is clearly important, but there is mixing in places.
  • The left has a weird peninsula sticking out.
  • There look to be two clusters on the right, one of which is connected to the left but has no large sites.
  • There's one cluster of sites far away from everything else.
In [71]:
subdomain_vectors_embedded_50nn = umap.UMAP(random_state=1, n_neighbors=50).fit_transform(normed_vectors)
plot_subdomain_embedding(subdomain_vectors_embedded_50nn)

Now we're looking at 50 neighbors instead of 15.

Observations

  • A lot of the same features still exist.
  • The right still has a couple distinct things going on
  • The left seems pretty spread out but also has a spike sticking out
  • There's still a small island of orange sites.
In [72]:
subdomain_vectors_embedded_allnn = umap.UMAP(random_state=1, n_neighbors=subdomain_vectors.shape[0]-1).fit_transform(normed_vectors)
plot_subdomain_embedding(subdomain_vectors_embedded_allnn)

Now we're looking at all neighbors.

Observations

  • Some of the local structure has broken down (as expected), but many of the major features identified above are still here
  • That tiny cluster that was off to the right is now an island in the middle.

To get a feel for what some of these features might represent in the parameter (and URL ideo space), I'm going to pull two samples from each topographic feature.

Large sites on the far right vs large right sites closer to center:

In [73]:
sites = ['foxnews.com', 'thefederalist.com',
        'breitbart.com', 'truepundit.com']
for site in sites:
    fit_site(df, site, show_plot=True)
subdomain_vectors.loc[sites,:]
Out[73]:
center_center center_height left_center left_height right_center right_height
foxnews.com 0.8 0.187760 -0.449842 0.020459 1.606871 1.226563
thefederalist.com 0.8 0.199354 -0.639858 0.015930 1.641796 1.239922
breitbart.com 0.8 0.048204 -1.084550 0.000000 1.637922 1.986324
truepundit.com 0.8 0.031763 -1.084550 0.000000 1.674306 1.954298

Observations

  • There are two differences:
    • The height of the center component. Fox News and The Federalist have fatter left tails that the others don't have.
    • Fox News and The Federalist also have non-zero left components, while the other two have left components that are small enough to get zeroed out.

That island in the center vs sites just off the island:

In [74]:
sites = ['thebiafratelegraph.co', 'lapatilla.com',
        'patreon.com', 'cato.org']
for site in sites:
    fit_site(df, site, show_plot=True)
subdomain_vectors.loc[sites,:]
/berkman/home/jclark/miniconda3/lib/python3.6/site-packages/lmfit/minimizer.py:774: RuntimeWarning:

invalid value encountered in sqrt

/berkman/home/jclark/miniconda3/lib/python3.6/site-packages/lmfit/minimizer.py:767: RuntimeWarning:

invalid value encountered in sqrt

Out[74]:
center_center center_height left_center left_height right_center right_height
thebiafratelegraph.co 0.522374 2.489767 -1.084550 0.000000 1.200000 0.048005
lapatilla.com 0.560943 1.620959 -1.084550 0.000000 1.514975 0.000000
patreon.com 0.594522 0.103311 -0.896033 1.051054 1.649065 0.438359
cato.org 0.536582 0.377477 -0.924770 0.224208 1.564579 0.156323

Observations

  • Looks like the sites in the island have a big center component and very small other components while that's not true of the others (or basically anything else).

Far end of the blue spike vs. near side of the blue spike:

In [75]:
sites = ['thinkprogress.org', 'actblue.com',
        'rawstory.com', 'newyorker.com']
for site in sites:
    fit_site(df, site, show_plot=True)
subdomain_vectors.loc[sites,:]
Out[75]:
center_center center_height left_center left_height right_center right_height
thinkprogress.org 3.702620e-32 0.019374 -1.057690 3.686879 1.514975 0.0
actblue.com 7.005425e-16 0.028070 -1.006814 3.716571 1.514975 0.0
rawstory.com 4.955805e-11 0.017125 -1.170463 1.934420 1.514975 0.0
newyorker.com 4.146758e-17 0.059876 -1.152475 1.937672 1.514975 0.0

Observations

  • The big differentiator here is the height of the left component. "Height" is a bit difficult to interpret in this context, but it's related to variance. I think the big difference along the spike is that the variance of the left component increases as you move into the big cluster. For each site in the spike, the ideology of URLs clusters tightly.

Futzing with Animation

There are a number of parameters to the dimensionality reduction algorithm, and something that would be nice is seeing how the graph shifts as those parameters change. One nice way of seeing that is by animating the graph. Animating the graph is somewhat useful in this context, but will be very useful when we're looking at things longitudinally, so I feel OK spending some time on it.

The thing that's difficult about this is finding the balance between making the graph stable from frame to frame and making the distances between nodes as meaningful as possible. I've addressed this in four ways:

  1. All graphs are rotated so that the mean ideology of each site increases as you move along the x-axis (as much as possible).
  2. Graphs are reflected over the x-axis; this seems necessary, though I'm not sure why yet.
  3. The positions of nodes in the graph are initialized at their position in the previous frame. This doesn't constrain their movement - just starts them off near where they were instead of somewhere random.
  4. Graphs are normalized so they all have zero mean and a standard deviation of one.

Some of this is taken from here.

It just renders the last frame here. The actual animations are here and here.

In [48]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcol
from matplotlib import animation
%matplotlib inline
import seaborn as sns
import itertools
sns.set(style='white', rc={'figure.figsize':(14, 12)})
In [105]:
def tween(e1, e2, n_frames=60):
    """Yield a hold-interpolate-hold sequence of frames from e1 to e2.

    20 copies of e1, then n_frames linear interpolation steps (inclusive
    of both endpoints), then 20 copies of e2.
    """
    hold = 20
    for _ in range(hold):
        yield e1
    last = float(n_frames - 1)
    for step in range(n_frames):
        t = step / last
        yield (1 - t) * e1 + t * e2
    for _ in range(hold):
        yield e2

def generate_frame_coords(data, height_cutoff=HEIGHT_CUTOFF, **kwargs):
    """Embed the normalized subdomain vectors with UMAP and align the result."""
    _, normed = model_results_to_subdomain_vectors(data, height_cutoff)
    embedded = umap.UMAP(random_state=1, **kwargs).fit_transform(normed)
    return rotate_to_optimal_alignment(embedded)
    
def generate_frame_data(data, arg_name='n_neighbors', arg_list=(3, 4, 5, 7, 9, 11, 15, 20, 25, 30, 40, 60, 100), **kwargs):
    """Compute an embedding per arg_list value and tween between consecutive ones.

    Each embedding after the first is initialized at the previous one's
    coordinates so successive frames stay visually stable. Returns a flat
    list of per-frame coordinate arrays.

    Note: the default arg_list is now an immutable tuple (mutable default
    arguments are a Python pitfall); callers may still pass a list.
    """
    embeddings = []
    for arg in arg_list:
        kwargs[arg_name] = arg
        if embeddings:
            embeddings.append(generate_frame_coords(data, init=embeddings[-1], **kwargs))
        else:
            embeddings.append(generate_frame_coords(data, **kwargs))

    frames = []
    for start, end in zip(embeddings[:-1], embeddings[1:]):
        # extend() consumes the tween generator directly; no list() needed.
        frames.extend(tween(start, end))

    return frames

def create_animation(frame_data, arg_name='n_neighbors', arg_list=[3,4,5,7,9,11,15,20,25,30,40,60,100]):
    """Render frame_data to 'umap_anim-<arg_name>.mp4' via ffmpeg.

    frame_data: list of (n_sites, 2) coordinate arrays, one per frame, as
    produced by generate_frame_data.
    """
    fig = plt.figure()
    all_data = np.vstack(frame_data)
    # Blue-grey-red colormap for left/center/right mean ideology.
    #pol_cm = mcol.LinearSegmentedColormap.from_list("Pol",["#3771f3", "#b147cc", "#d62222"])
    pol_cm = mcol.LinearSegmentedColormap.from_list("Pol",["#3771f3", "#cccccc", "#d62222"])
    # Fixed axis bounds over all frames (with 10% margin) so the viewport
    # doesn't jump between frames.
    frame_bounds = (all_data[:, 0].min() * 1.1, 
                    all_data[:, 0].max() * 1.1,
                    all_data[:, 1].min() * 1.1, 
                    all_data[:, 1].max() * 1.1)
    ax = plt.axes(xlim=(frame_bounds[0], frame_bounds[1]), 
                  ylim=(frame_bounds[2], frame_bounds[3]))
    # Marker size scales with total shares; color with mean sharer ideology.
    points = plt.scatter(frame_data[0][:, 0], frame_data[0][:, 1], 
                        #s=5,
                        s=[max(10, s) for s in (subdomain_info['num_shares'] / 1000)],
                        c=subdomain_info['mean_url_ideo'], 
                        cmap=pol_cm,
                        alpha=0.8,
                        edgecolors='white',
                        animated=True)
    title = plt.title('', fontsize=24)
    ax.set_xticks([])
    ax.set_yticks([])
    #cbar = plt.colorbar(values=np.arange(10), boundaries=np.arange(11)-0.5, ticks=np.arange(10), drawedges=True)
    #cbar.ax.yaxis.set_ticklabels(np.arange(10), fontsize=18)

    def init():
        points.set_offsets(frame_data[0])
        title.set_text('UMAP with {}={}'.format(arg_name, arg_list[0]))
        return points,

    def animate(i):
        points.set_offsets(frame_data[i])
        # Each arg_list transition spans 100 frames (tween's 20 hold + 60
        # interpolation + 20 hold); the +15 offset makes the title flip land
        # inside the end-of-transition hold. NOTE(review): this hard-codes
        # tween's default frame counts -- confirm if those ever change.
        if (i + 15) % 100 == 0:
            title.set_text('UMAP with {}={}'.format(arg_name, arg_list[(i + 15) // 100]))
        return points,

    anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(frame_data), interval=20, blit=True)
    anim.save('umap_anim-{}.mp4'.format(arg_name), writer='ffmpeg', fps=30)
    
def animate_param(data, arg_name='n_neighbors', arg_list=(3, 4, 5, 7, 9, 11, 15, 20, 25, 30, 40, 60, 100), **kwargs):
    """Generate and save an animation sweeping UMAP parameter *arg_name* over *arg_list*.

    Extra kwargs are forwarded to UMAP for every frame. The default
    arg_list is an immutable tuple (mutable default arguments are a
    Python pitfall); callers may still pass a list.
    """
    frame_data = generate_frame_data(data, arg_name, arg_list, **kwargs)
    create_animation(frame_data, arg_name, arg_list)
In [104]:
%%time
#animate_param(model_outputs, 'n_neighbors', [3,15,100,954])
animate_param(model_outputs, 'n_neighbors', [3,4,5,7,9,11,15,30,60,100,200,300,400,500,600,700,800,954])
/berkman/home/jclark/miniconda3/lib/python3.6/site-packages/umap/spectral.py:229: UserWarning:

Embedding a total of 15 separate connected components using meta-embedding (experimental)

CPU times: user 3min 13s, sys: 6.94 s, total: 3min 20s
Wall time: 2min 37s
In [106]:
%%time
#animate_param(model_outputs, 'height_cutoff', [0, 0.1])
animate_param(model_outputs, 'height_cutoff', [0, 0.0001, 0.001, 0.005, 0.007, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 0.5], n_neighbors=954)
CPU times: user 3min 45s, sys: 6.98 s, total: 3min 52s
Wall time: 2min 47s

To Do

  • I should put more thought into interpreting what the observations mean in terms of the research questions we're looking at.
  • Look for opportunities to use these parameter estimates.
  • Fit the same model to sharer ideology distributions.
  • Should the center be skewable? Look at reuters.com