Updated Mar 7, 2019
Changes
The goal here is to replicate Pablo Barberá's work on estimating the political ideologies of Twitter accounts from their follower lists. The general approach is:
1 if following and 0 otherwise. As a first pass, the "elite" space is made up entirely of députés and sénateurs. We've already collected the list of followers for each of these accounts.
%matplotlib inline
import os, logging
import pandas as pd
import numpy as np
import seaborn as sns
import prince, tqdm, scipy.sparse
sns.set()
# Read the elite screen names, one per line, with surrounding whitespace removed.
with open('data/elite_screen_names.txt', 'r') as f:
    elite_screen_names = list(map(str.strip, f))
len(elite_screen_names)
# Map each elite screen name to the follower IDs stored in
# data/followers/<screen name>.txt (one integer per line).
elite_to_followers = {}
for sn in tqdm.tqdm_notebook(elite_screen_names):
    try:
        with open(f'data/followers/{sn}.txt', 'r') as f:
            elite_to_followers[sn] = list(map(int, f))
    except FileNotFoundError:
        # A missing follower file is logged and the elite is left out.
        logging.warning(f'Follower list not found for {sn}')
fmeunier19's and AlainVasselle's accounts are protected. Account e807Limon does not exist.
We want to make sure each of our elites has a minimum number of followers. This is so we can split the users into two groups by engagement (to better estimate the ideological space) and make sure all the elites have followers in both groups. On the first pass, this became an issue for the Twitter account DidierParis.
# Follower count per elite, taken from the length of each loaded list.
elite_to_num_followers = {
    name: len(ids) for name, ids in elite_to_followers.items()
}
pd.Series(elite_to_num_followers).sort_values().head()

# Drop elites below the follower threshold. Iterate over a snapshot of the
# keys because entries are deleted from the dict inside the loop.
# elite_to_num_followers itself is not pruned.
MIN_ELITE_FOLLOWERS = 6
for elite in tuple(elite_to_followers):
    if elite_to_num_followers[elite] >= MIN_ELITE_FOLLOWERS:
        continue
    logging.warning(f'@{elite} has {elite_to_num_followers[elite]} followers and will be removed')
    del elite_to_followers[elite]
%%time
# Canonical ordering of elites and followers.
# Sorting (rather than relying on set iteration order) keeps the row/column
# layout identical from run to run: string hashing is randomized per process,
# so list(set(...)) could order the elite screen names differently each session.
elites = sorted(elite_to_followers)
followers = sorted({follower for fs in elite_to_followers.values() for follower in fs})
elite_to_i = {e: i for i, e in enumerate(elites)}
follower_to_i = {f: i for i, f in enumerate(followers)}

# Coordinate lists for the sparse follower-by-elite matrix:
# row = follower position, column = elite position, value = 1.
data, follower_idx, elite_idx = [], [], []
for e, fs in elite_to_followers.items():
    # De-duplicate IDs within one follower list: repeated (row, col) pairs are
    # summed when the COO matrix is converted to CSR, which would produce
    # entries greater than 1 instead of a 0/1 indicator.
    unique_fs = set(fs)
    data.extend([1] * len(unique_fs))
    elite_idx.extend([elite_to_i[e]] * len(unique_fs))
    follower_idx.extend(follower_to_i[f] for f in unique_fs)

# The shape is stated explicitly so it always matches the two orderings above
# instead of being inferred from the largest index present.
follower_matrix = scipy.sparse.coo_matrix(
    (data, (follower_idx, elite_idx)),
    shape=(len(followers), len(elites)),
    dtype=np.int8,
)
follower_matrix = follower_matrix.tocsr()
follower_matrix.shape
# Dense follower-by-elite table: rows are labelled with follower IDs and
# columns with elite screen names, in the same order used to build the matrix.
follower_df = pd.DataFrame(follower_matrix.toarray(), index=followers, columns=elites)

# Column totals: one value per elite.
num_followers = follower_df.sum().rename('num_followers')
num_followers.sort_values(ascending=False).head(10)

# Row totals: one value per follower account.
num_followed = follower_df.sum(axis=1).rename('num_followed')
num_followed.sort_values(ascending=False).head(10)
We're interested in the most engaged accounts, here measured by the number of political elites they follow. Let's see what that distribution looks like.
# Number of elites followed by each account (row sums of the follow table).
num_elite_followed = follower_df.sum(axis=1)

# Bar chart of the 20 most common "elites followed" counts, log-scaled y axis.
most_common_counts = num_elite_followed.value_counts().sort_values(ascending=False).head(20)
ax = most_common_counts.plot.bar(logy=True)
_ = ax.set_ylabel('# Twitter accounts')
_ = ax.set_xlabel('# political elites followed')
Let's filter to accounts following 3 or more political elites. That will both trim the data down to a more manageable size and hopefully also make the space more representative of ideology.
# An account counts as "engaged" when it follows at least this many elites.
MIN_FOLLOWED_FOR_ENGAGED = 3
engaged_ids = follower_df.index[num_elite_followed >= MIN_FOLLOWED_FOR_ENGAGED]
engaged_df = follower_df.reindex(engaged_ids)
engaged_df.shape
%%time
# Fit a 3-component correspondence analysis on the engaged accounts.
ca = prince.CA(n_components=3).fit(engaged_df)
%%time
# Coordinates of each engaged account in the fitted space.
engaged_coords = ca.row_coordinates(engaged_df)
_ = sns.pairplot(
    engaged_coords,
    height=5,
    aspect=1.8,
    plot_kws={'alpha': 0.1},
    diag_kws={'bins': 50},
)
# Most frequent values of the first component.
engaged_coords[0].value_counts().sort_values(ascending=False).head()
def get_following(account_id):
    """Return the column labels of ``follower_df`` that equal 1 for *account_id*.

    Looks up the row labelled ``account_id`` in the module-level
    ``follower_df`` and returns, as a NumPy array, the labels of the columns
    whose value in that row is exactly 1.
    """
    follows = follower_df.loc[account_id]
    return follows[follows == 1].index.values
# Spot-check the follow lists of five random accounts that share one
# first-component score (compared after rounding to 6 decimals).
# NOTE(review): the literal scores look copied from a previous run's
# value_counts output; confirm they still exist in the current fit.
[
    get_following(account_id)
    for account_id in engaged_coords[engaged_coords[0].round(6) == -1.227023].sample(5).index.values
]
[
    get_following(account_id)
    for account_id in engaged_coords[engaged_coords[0].round(6) == -1.125949].sample(5).index.values
]
There are a lot of accounts with exactly the same score. This is because there are a lot of accounts with exactly the same set of accounts they're following. This isn't a problem per se, but it can be a limitation in follow-on analysis. (Sets of politicians that are followed because they are popular or influential muddy the ideological space we're looking for.) This limitation can be worked around in two ways: raising the minimum number of followed elites to consider an account "engaged", or doing that plus adding more elite accounts in a principled way. We're investigating the second.
# Coordinates of each elite (column) in the space fitted on engaged accounts.
elite_coords = ca.column_coordinates(engaged_df)
elite_coords.sample(5)

# Plot the elites with the log follower count as both marker size and colour.
log_num_followers = np.log(num_followers)
_ = sns.pairplot(
    elite_coords.join(log_num_followers),
    height=5,
    aspect=1.8,
    plot_kws={'alpha': 0.3, 'size': log_num_followers, 'hue': log_num_followers},
    diag_kws={'bins': 50},
)
# Accounts following fewer elites than the engagement threshold.
not_engaged_ids = follower_df.index[num_elite_followed < MIN_FOLLOWED_FOR_ENGAGED]
not_engaged_df = follower_df.reindex(not_engaged_ids)
not_engaged_df.shape
%%time
# Project the not-engaged accounts with the model fitted on engaged accounts.
not_engaged_coords = ca.transform(not_engaged_df)
not_engaged_coords.shape
_ = sns.pairplot(
    not_engaged_coords,
    height=5,
    aspect=1.8,
    plot_kws={'alpha': 0.3},
    diag_kws={'bins': 50},
)
Looking at the graphs above, the number of followers still seems like it plays a large role in determining where elites fall in the space. I think some of this might be coming in from the heavily repeated number of unique sets of followers. For example, the collection of ['manuelvalls', 'jlmelenchon', 'MLP_officiel'] is followed 157K times. I'd guess the prominence of those accounts is more explanatory of their co-occurrence than ideological similarity is. As a first pass at compensating for this, I'm only going to look at unique follow-sets of elites, i.e. ['manuelvalls', 'jlmelenchon', 'MLP_officiel'] is only going to count once instead of 157K times.
%%time
# Keep only the first account for each distinct set of followed elites.
is_repeat_set = engaged_df.duplicated()
unique_follower_sets_df = engaged_df[~is_repeat_set]
unique_follower_sets_df.shape
%%time
# Fit a second 3-component correspondence analysis on the unique follow-sets.
ca_unique = prince.CA(n_components=3).fit(unique_follower_sets_df)
%%time
# Coordinates of each unique follow-set in the second fitted space.
unique_sets_coords = ca_unique.row_coordinates(unique_follower_sets_df)
_ = sns.pairplot(
    unique_sets_coords,
    height=5,
    aspect=1.8,
    plot_kws={'alpha': 0.1},
    diag_kws={'bins': 50},
)
# Most frequent values of the first component.
unique_sets_coords[0].value_counts().sort_values(ascending=False).head()
# Coordinates of each elite (column) in the space fitted on unique follow-sets.
elite_unique_sets_coords = ca_unique.column_coordinates(unique_follower_sets_df)
elite_unique_sets_coords.sample(5)

# Plot the elites with the log follower count as both marker size and colour.
log_num_followers = np.log(num_followers)
elite_unique_with_followers = elite_unique_sets_coords.join(log_num_followers)
_ = sns.pairplot(
    elite_unique_with_followers,
    height=5,
    aspect=1.8,
    plot_kws={'alpha': 0.5, 'size': log_num_followers, 'hue': log_num_followers},
    diag_kws={'bins': 50},
)
elite_unique_sets_coords.describe()
# Write every coordinate table to its CSV path under data/.
# NOTE(review): the last filename reads "..._sets_oordinates" (missing the
# "c" of "coordinates"). It is kept byte-for-byte here because other code may
# already read that path; confirm before renaming.
coordinate_outputs = (
    (elite_coords, 'data/political_elite_account_coordinates.csv.gz'),
    (engaged_coords, 'data/engaged_account_coordinates.csv.gz'),
    (not_engaged_coords, 'data/not_engaged_account_coordinates.csv.gz'),
    (elite_unique_sets_coords, 'data/political_elite_account_unique_follower_sets_coordinates.csv.gz'),
    (unique_sets_coords, 'data/engaged_account_unique_follower_sets_oordinates.csv.gz'),
)
for coords, path in coordinate_outputs:
    coords.to_csv(path)