import scipy.io
import numpy as np
#np.random.set_state(42)
import pandas as pd
import prince
import seaborn as sbs
# Load the adjacency matrix from a gzip-compressed Matrix Market file into a
# SciPy sparse matrix. `%time` is an IPython line magic, so this file only
# runs under IPython/Jupyter, not the plain Python interpreter.
# Presumably rows are accounts and columns are members of Congress (the
# column coordinates are later named `congress_coords`) -- TODO confirm.
%time matrix = scipy.io.mmread('/berkman/scratch/jclark/mc/barbera_replication/adj-matrix-20180705.mm.gz')
# Bare expression: echoes the matrix summary when it ends a notebook cell.
matrix
# NOTE(review): pd.SparseDataFrame was deprecated in pandas 0.25 and removed
# in pandas 1.0; on newer pandas the equivalent constructor is
# pd.DataFrame.sparse.from_spmatrix(matrix).
%time df = pd.SparseDataFrame(matrix)
# Replace missing entries with 0 so the row sums below count only stored
# values.
df = df.fillna(0)
# Per-row total across all columns.
row_sums = df.sum(axis=1)
# Rows whose total is at least 10; the CA model is fitted on these.
highly_engaged = df[row_sums >= 10]
highly_engaged.shape
# Rows whose total is strictly between 6 and 10. Rows with a total of 6 or
# less fall in neither subset and are dropped from the analysis.
not_highly_engaged = df[(row_sums > 6) & (row_sums < 10)]
not_highly_engaged.shape
ca = prince.CA(n_components=2)
%time ca = ca.fit(highly_engaged)
%time coords = ca.transform(not_highly_engaged)
coords = ca.row_coordinates(highly_engaged).append(coords).drop_duplicates()
# Visualise the row coordinates. Results are assigned to `_` so the notebook
# does not echo the returned grid/axes objects.
# Pairwise scatter of the two components with KDE marginals on the diagonal.
_ = sbs.pairplot(coords, height=5, diag_kind='kde', aspect=1.8)
# Distribution of the first component.
# NOTE(review): distplot is deprecated since seaborn 0.11 (histplot/displot
# replace it); kept as-is so older seaborn installs keep working.
_ = sbs.distplot(coords[0], bins=5000)
# Joint density of the two components. x/y are passed by keyword because
# seaborn >= 0.12 no longer accepts the data vectors positionally; the
# keyword form also works on older releases.
_ = sbs.jointplot(x=coords[0], y=coords[1], kind='hex')
# Column coordinates from the same fitted model, plotted the same way.
congress_coords = ca.column_coordinates(highly_engaged)
_ = sbs.jointplot(x=congress_coords[0], y=congress_coords[1])