In [3]:
import scipy.io
import numpy as np
# np.random.seed(42)  # disabled seeding call; note np.random.set_state() expects a full state tuple, not an int seed
import pandas as pd
import prince
import seaborn as sbs
In [6]:
%time matrix = scipy.io.mmread('/berkman/scratch/jclark/mc/barbera_replication/adj-matrix-20180705.mm.gz')
CPU times: user 5min 58s, sys: 1.5 s, total: 5min 59s
Wall time: 6min
In [7]:
# Display the loaded matrix's repr (dimensions, dtype, stored-element count, storage format).
matrix
Out[7]:
<10417522x528 sparse matrix of type '<class 'numpy.float64'>'
	with 65657822 stored elements in COOrdinate format>
In [8]:
%time df = pd.SparseDataFrame(matrix)
CPU times: user 6.69 s, sys: 4.56 s, total: 11.3 s
Wall time: 6.26 s
In [9]:
# Replace NaN cells with 0.
# Presumably the NaNs are the entries not stored in the sparse matrix — TODO confirm.
df = df.fillna(value=0)
In [10]:
# Total of each row, summed across all columns.
row_sums = df.sum(axis='columns')
In [11]:
# Keep only the rows whose total is at least 10.
engaged_mask = row_sums >= 10
highly_engaged = df[engaged_mask]
In [12]:
# Check how many rows passed the ">= 10" filter (rows, columns).
highly_engaged.shape
Out[12]:
(1241634, 528)
In [13]:
# Keep the rows whose total is strictly between 6 and 10.
# Rows with a total of 6 or less end up in neither subset.
mid_band_mask = row_sums.gt(6) & row_sums.lt(10)
not_highly_engaged = df[mid_band_mask]
In [14]:
# Check how many rows fell in the "more than 6, fewer than 10" band (rows, columns).
not_highly_engaged.shape
Out[14]:
(970488, 528)
In [15]:
ca = prince.CA(n_components=2)
%time ca = ca.fit(highly_engaged)
CPU times: user 3min 49s, sys: 53.8 s, total: 4min 43s
Wall time: 2min 33s
In [16]:
# Compute coordinates for the less-engaged rows using the CA model that was fitted on the
# highly engaged rows; these rows took no part in the fit.
%time coords = ca.transform(not_highly_engaged)
CPU times: user 43.9 s, sys: 20.2 s, total: 1min 4s
Wall time: 43.2 s
In [17]:
# Stack the coordinates of the rows used for fitting on top of the projected rows,
# then drop any row whose coordinate values exactly repeat an earlier row.
fitted_coords = ca.row_coordinates(highly_engaged)
coords = pd.concat([fitted_coords, coords]).drop_duplicates()
In [27]:
# Pairwise scatter plots of the coordinate columns, with KDE curves on the diagonal.
_ = sbs.pairplot(data=coords, diag_kind='kde', height=5, aspect=1.8)
In [19]:
# Distribution of the first coordinate column, using 5000 histogram bins.
first_dim = coords[0]
_ = sbs.distplot(first_dim, bins=5000)
In [24]:
# Hexbin joint plot of the first coordinate column against the second.
_ = sbs.jointplot(x=coords[0], y=coords[1], kind='hex')
In [25]:
# Coordinates of the matrix columns from the fitted CA, plotted against each other.
# Presumably each column is a member of Congress, going by the variable name — TODO confirm.
congress_coords = ca.column_coordinates(highly_engaged)
_ = sbs.jointplot(x=congress_coords[0], y=congress_coords[1])
/berkman/home/jclark/miniconda3/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval