Comparing Media Ideology Estimates

Updated Feb 6, 2019

This is an unfinished notebook containing code to compare media-source ideology scores produced by several estimation methods (MediaCloud retweet-based, text-based, Facebook shares, Media Bias/Fact Check labels, and Twitter-follower estimates).

In [1]:
import pandas as pd
import seaborn as sb
import numpy as np

import url_tools
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-1-fabe559ab621> in <module>
      3 import numpy as np
      4 
----> 5 import url_tools

ModuleNotFoundError: No module named 'url_tools'
In [12]:
# Benkler/MediaCloud ideology scores derived from election retweeters,
# indexed by cleaned (sub)domain with the score renamed to 'ppd_ideo'.
mc_retweet = pd.read_csv('ideo_estimates/mediacloud_media_ideology_from_election_retweeters_20161107.csv')
mc_retweet['subdomain'] = mc_retweet['url'].map(url_tools.domain_with_subdomains)
# Fix: the original set the index, copied it back out as a column, then set
# it again — a redundant round-trip. One chained pass is equivalent.
mc_retweet = (
    mc_retweet
    .drop(['url', 'name'], axis=1)
    .rename({'score': 'ppd_ideo'}, axis=1)
    .set_index('subdomain')
)
# FIXME: hardcoded absolute path — move to a configurable output directory.
mc_retweet.to_csv('/home/jclark/Desktop/benkler_pdd_media_source_ideo.csv')
# Sample last so it actually renders; previously it was discarded because
# to_csv was the cell's final statement.
mc_retweet.sample(5)
In [10]:
# Same MediaCloud scores, minus partition/media_id, renamed 'mc_retweet'.
# Also builds name_to_domain, a lookup from outlet name to cleaned domain,
# used below to join Aaron's text-based scores.
mc_retweet = pd.read_csv('ideo_estimates/mediacloud_media_ideology_from_election_retweeters_20161107.csv').drop(['partition', 'media_id'], axis=1)
mc_retweet['cleaned_domain'] = mc_retweet['url'].map(url_tools.domain_with_subdomains)
# Fix: dropped the redundant set_index -> copy-index-back -> set_index dance;
# keeping cleaned_domain as a plain column until the final set_index is
# equivalent and much clearer.
mc_retweet = mc_retweet.drop('url', axis=1)
# Lookup table: outlet name -> cleaned domain (score dropped).
name_to_domain = mc_retweet.set_index('name').drop('score', axis=1)
mc_retweet = mc_retweet.set_index('cleaned_domain').drop('name', axis=1).rename({'score': 'mc_retweet'}, axis=1)
mc_retweet.sample(5)
Out[10]:
media_id ppd_ideo_score partition
subdomain
rsbn.tv 278246 0.972456 5
brookings.edu 19115 -0.837545 1
nationalinterest.org 25488 0.314017 4
chicagotribune.com 9 -0.274454 2
liberalamerica.org 147130 -0.857859 1
In [60]:
# Aaron's text-derived ideology scores, keyed on outlet name ('media').
# Map names to domains via name_to_domain; when the lookup misses, fall back
# to parsing the 'media' field itself as a URL/domain.
# NOTE(review): the filename date suffix '20181931' looks like a typo for a
# real date — confirm against the original data drop before renaming.
aaron_text = pd.read_csv('ideo_estimates/aaron_media_ideology_from_text_20181931.csv')
aaron_text = aaron_text.join(name_to_domain, on='media')
# Fix: assign back instead of `inplace=True` fillna on a column selection —
# that form is chained assignment and silently stops working under pandas
# copy-on-write.
aaron_text['cleaned_domain'] = aaron_text['cleaned_domain'].fillna('')
aaron_text['new_cleaned_domain'] = aaron_text.apply(
    lambda r: r['cleaned_domain'] if r['cleaned_domain'] != '' else url_tools.domain_with_subdomains(r['media']), axis=1)
aaron_text = aaron_text[aaron_text['new_cleaned_domain'] != ''].set_index('new_cleaned_domain').drop_duplicates()\
            .drop(['Unnamed: 0', 'media', 'cleaned_domain'], axis=1).rename({'idea': 'aaron_text'}, axis=1)
aaron_text.sample(5)
Out[60]:
aaron_text
new_cleaned_domain
humantrafficking.change.org 0.075772
fusion.net 0.112765
plus.google.com -0.155597
bustle.com 0.166565
rewire.news 0.035247
In [67]:
# Bakshy et al. Facebook-share alignment scores, indexed by cleaned domain.
facebook = pd.read_csv('ideo_estimates/facebook_media_ideology_from_facebook_shares_20150107.csv')
facebook['cleaned_domain'] = facebook['domain'].map(url_tools.domain_with_subdomains)
facebook = (
    facebook
    .set_index('cleaned_domain')[['avg_align']]
    .rename(columns={'avg_align': 'bakshy_facebook'})
)
facebook.sample(5)
Out[67]:
bakshy_facebook
cleaned_domain
cnsnews.com 0.8994
fox10phoenix.com 0.2386
c-span.org -0.0442
npr.org -0.6103
usatoday.com -0.0635
In [76]:
# Media Bias/Fact Check manual labels, converted to a numeric left/right
# scale in [-1, 1]; rows whose label is missing or not in the mapping are
# dropped by the two dropna() calls.
mbfc = pd.read_csv('ideo_estimates/mediabiasfactcheck_media_ideology_from_manual_review_20170610.csv')
mbfc['cleaned_domain'] = mbfc['domain'].map(url_tools.domain_with_subdomains)
bias_to_score = {
    'left': -1.0,
    'left_center': -0.5,
    'least_biased': 0.0,
    'right_center': 0.5,
    'right': 1.0,
}
mbfc = (
    mbfc
    .set_index('cleaned_domain')['mediabiasfactcheck']
    .dropna()
    .map(bias_to_score)
    .dropna()
    .to_frame()
)
mbfc.sample(5)
Out[76]:
mediabiasfactcheck
cleaned_domain
thelocal.no -0.5
democratandchronicle.com -0.5
soas.ac.uk 0.0
unbiasedamerica.com 1.0
vox.com -1.0
In [77]:
# Barberá-style Twitter-follower ideology estimates, indexed by cleaned
# domain, with the score column renamed 'justin_barbera'.
barbera = pd.read_csv('ideo_estimates/usandbarbera_media_ideology_from_twitter_followers_20180705.csv')
barbera = barbera.dropna(subset=['domain'])
barbera['cleaned_domain'] = barbera['domain'].map(url_tools.domain_with_subdomains)
barbera = (
    barbera
    .set_index('cleaned_domain')['mean_ideology']
    .rename('justin_barbera')
    .to_frame()
)
barbera.sample(5)
Out[77]:
justin_barbera
cleaned_domain
theregister.co.uk 0.025345
wiscnews.com -0.359697
wesearchr.com 1.728245
politicshome.com -0.013283
actionnetwork.org -0.610554
In [86]:
# Outer-join all five estimates into one frame keyed on cleaned domain;
# rows missing from a source get NaN in that source's column.
# NOTE(review): aaron_text is excluded from the exported CSV — presumably
# not yet vetted; confirm before sharing the file.
joined = mc_retweet.join([aaron_text, facebook, mbfc, barbera], how='outer')
joined.drop('aaron_text', axis=1).to_csv('media_domain_ideology_estimates.csv')
joined.sample(5)
Out[86]:
mc_retweet aaron_text bakshy_facebook mediabiasfactcheck justin_barbera
gruntstyle.com NaN NaN NaN NaN 1.817092
syriacivildefense.org NaN -0.087706 NaN NaN NaN
artvoice.com NaN NaN NaN NaN 1.594961
panthers.com NaN NaN NaN NaN 0.418425
twimg.com -0.492606 0.010689 NaN NaN 0.563475
In [78]:
_ = sb.pairplot(joined)
/home/jclark/miniconda3/envs/mediacloud/lib/python3.6/site-packages/numpy/lib/histograms.py:746: RuntimeWarning: invalid value encountered in greater_equal
  keep = (tmp_a >= first_edge)
/home/jclark/miniconda3/envs/mediacloud/lib/python3.6/site-packages/numpy/lib/histograms.py:747: RuntimeWarning: invalid value encountered in less_equal
  keep &= (tmp_a <= last_edge)
In [79]:
_ = sb.pairplot(joined.rank())
In [81]:
# Hand-picked domains for a side-by-side rank comparison across sources.
news_media_domains = [
    'alarabiya.net', 'aljazeera.com', 'americanthinker.com', 'bbc.com',
    'bbc.co.uk', 'bloomberg.com', 'bostonglobe.com', 'breitbart.com',
    'buzzfeed.com', 'cbc.ca', 'cbsnews.com', 'chicagotribune.com', 'cnbc.com',
    'cnn.com', 'csmonitor.com', 'dailycaller.com', 'dailykos.com',
    'dailymail.co.uk', 'economist.com', 'forbes.com', 'foreignpolicy.com',
    'fortune.com', 'foxnews.com', 'haaretz.com', 'hindustantimes.com',
    'huffingtonpost.com', 'huffpost.com', 'independent.co.uk', 'infowars.com',
    'latimes.com', 'miamiherald.com', 'motherjones.com', 'msnbc.com',
    'nationalreview.com', 'nbcnews.com', 'newsweek.com', 'newyorker.com',
    'npr.org', 'nydailynews.com', 'nypost.com', 'nytimes.com', 'pbs.org',
    'politico.com', 'propublica.org', 'realclearpolitics.com', 'reuters.com',
    'rollcall.com', 'rt.com', 'salon.com', 'sky.com', 'slate.com',
    'sputniknews.com', 'theatlantic.com', 'theguardian.com', 'thehill.com',
    'time.com', 'usatoday.com', 'vox.com', 'washingtonpost.com',
    'washingtontimes.com', 'weeklystandard.com', 'westernjournal.com', 'wsj.com',
    'zerohedge.com',
]
non_news_domains = [
    'aclu.org', 'change.org', 'cosmopolitan.com', 'facebook.com', 'google.com',
    'harvard.edu', 'hbr.org', 'mit.edu', 'patreon.com', 'politifact.com',
    'reddit.com', 'reuters.com', 'twitter.com', 'wikileaks.org', 'youtube.com',
]
# Fix: 'reuters.com' appears in both lists — dedupe while preserving order.
domains = list(dict.fromkeys(news_media_domains + non_news_domains))
# Fix: .loc with any label missing from the index raises KeyError in
# pandas >= 1.0, so restrict to domains we actually have estimates for.
present_domains = [d for d in domains if d in joined.index]
# Rank-transform so differently-scaled sources share one axis; the trailing
# semicolon suppresses the AxesSubplot repr.
joined.loc[present_domains, :].drop_duplicates().rank().plot.barh(figsize=(10, 38));
Out[81]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3c7f74b6d8>