Updated Feb 6, 2019.
Unfinished notebook: code for comparing media-source ideology scores drawn from several estimates (MediaCloud election retweeters, text analysis, Facebook shares, MediaBiasFactCheck manual review, and Twitter-follower scaling).
import pandas as pd
import seaborn as sb
import numpy as np
import url_tools
# --- Export the retweet-based ideology scores to a standalone CSV ---
mc_retweet = pd.read_csv('ideo_estimates/mediacloud_media_ideology_from_election_retweeters_20161107.csv')
# Normalize raw URLs to a subdomain-level key so scores can be joined across sources.
mc_retweet['subdomain'] = mc_retweet['url'].apply(url_tools.domain_with_subdomains)
# The original code set the index, copied it back into a column, then set it
# again -- a no-op round trip. One set_index after the drops is equivalent.
mc_retweet = (mc_retweet
              .drop(['url', 'name'], axis=1)
              .rename({'score': 'ppd_ideo'}, axis=1)
              .set_index('subdomain'))
mc_retweet.sample(5)
# NOTE(review): the filename says "pdd" while the column is "ppd_ideo" --
# the runtime path is left untouched; confirm which spelling is intended.
mc_retweet.to_csv('/home/jclark/Desktop/benkler_pdd_media_source_ideo.csv')
# --- Reload the retweet scores as the base frame for the big join ---
mc_retweet = pd.read_csv(
    'ideo_estimates/mediacloud_media_ideology_from_election_retweeters_20161107.csv'
).drop(['partition', 'media_id'], axis=1)
mc_retweet['cleaned_domain'] = mc_retweet['url'].apply(url_tools.domain_with_subdomains)
mc_retweet = mc_retweet.drop('url', axis=1)
# Lookup table from MediaCloud source name -> cleaned domain, used below to
# attach domains to the text-based estimates (which only carry names).
name_to_domain = mc_retweet.set_index('name').drop('score', axis=1)
# As above, the redundant set_index / copy-index / set_index round trip is
# collapsed into a single set_index.
mc_retweet = (mc_retweet
              .drop('name', axis=1)
              .rename({'score': 'mc_retweet'}, axis=1)
              .set_index('cleaned_domain'))
mc_retweet.sample(5)
# --- Text-based ideology estimates (keyed by outlet name, not URL) ---
aaron_text = pd.read_csv('ideo_estimates/aaron_media_ideology_from_text_20181931.csv')
# Attach cleaned domains via the MediaCloud name lookup; unmatched names get NaN.
aaron_text = aaron_text.join(name_to_domain, on='media')
# Assign instead of fillna(..., inplace=True) on a column slice, which is
# chained assignment and raises a FutureWarning in modern pandas.
aaron_text['cleaned_domain'] = aaron_text['cleaned_domain'].fillna('')
# Fall back to parsing the 'media' field itself as a URL when the name
# lookup found no domain.
aaron_text['new_cleaned_domain'] = aaron_text.apply(
    lambda r: r['cleaned_domain'] if r['cleaned_domain'] != ''
    else url_tools.domain_with_subdomains(r['media']), axis=1)
# Keep only rows that resolved to a domain; drop bookkeeping columns and
# rename the score column to match the joined table's naming scheme.
aaron_text = (aaron_text[aaron_text['new_cleaned_domain'] != '']
              .set_index('new_cleaned_domain')
              .drop_duplicates()
              .drop(['Unnamed: 0', 'media', 'cleaned_domain'], axis=1)
              .rename({'idea': 'aaron_text'}, axis=1))
aaron_text.sample(5)
# --- Bakshy et al. Facebook-shares alignment scores ---
facebook = pd.read_csv('ideo_estimates/facebook_media_ideology_from_facebook_shares_20150107.csv')
facebook['cleaned_domain'] = facebook['domain'].apply(url_tools.domain_with_subdomains)
# Keep only the alignment score, indexed by cleaned domain.
facebook = (facebook
            .set_index('cleaned_domain')
            .loc[:, ['avg_align']]
            .rename({'avg_align': 'bakshy_facebook'}, axis=1))
facebook.sample(5)
# --- MediaBiasFactCheck manual ratings, mapped onto a numeric left/right scale ---
mbfc = pd.read_csv('ideo_estimates/mediabiasfactcheck_media_ideology_from_manual_review_20170610.csv')
mbfc['cleaned_domain'] = mbfc['domain'].apply(url_tools.domain_with_subdomains)
# Ordinal label -> numeric score; labels outside this map become NaN and are dropped.
bias_scale = {
    'left': -1.0,
    'left_center': -0.5,
    'least_biased': 0.0,
    'right_center': 0.5,
    'right': 1.0,
}
mbfc = (mbfc.set_index('cleaned_domain')['mediabiasfactcheck']
        .dropna()
        .map(bias_scale)
        .dropna()
        .to_frame())
mbfc.sample(5)
# --- Twitter-follower-based (Barbera-style) scores ---
barbera = pd.read_csv(
    'ideo_estimates/usandbarbera_media_ideology_from_twitter_followers_20180705.csv'
).dropna(subset=['domain'])
barbera['cleaned_domain'] = barbera['domain'].apply(url_tools.domain_with_subdomains)
# Keep only the mean ideology, renamed and indexed by cleaned domain.
barbera = (barbera.set_index('cleaned_domain')
           .loc[:, 'mean_ideology']
           .rename('justin_barbera')
           .to_frame())
barbera.sample(5)
# Combine all estimates into one wide table, one row per cleaned domain;
# outer join keeps domains present in any single source.
joined = mc_retweet.join([aaron_text, facebook, mbfc, barbera], how='outer')
# Persist everything except the text-based column.
joined.drop('aaron_text', axis=1).to_csv('media_domain_ideology_estimates.csv')
joined.sample(5)
# Pairwise scatter of raw scores, and of their ranks (rank space is more
# comparable across differently scaled estimates).
_ = sb.pairplot(joined)
_ = sb.pairplot(joined.rank())
# Hand-picked set of well-known news outlets used for the comparison chart.
news_media_domains = [
'alarabiya.net', 'aljazeera.com', 'americanthinker.com', 'bbc.com',
'bbc.co.uk', 'bloomberg.com', 'bostonglobe.com', 'breitbart.com',
'buzzfeed.com', 'cbc.ca', 'cbsnews.com', 'chicagotribune.com', 'cnbc.com',
'cnn.com', 'csmonitor.com', 'dailycaller.com', 'dailykos.com',
'dailymail.co.uk', 'economist.com', 'forbes.com', 'foreignpolicy.com',
'fortune.com', 'foxnews.com', 'haaretz.com', 'hindustantimes.com',
'huffingtonpost.com', 'huffpost.com', 'independent.co.uk', 'infowars.com',
'latimes.com', 'miamiherald.com', 'motherjones.com', 'msnbc.com',
'nationalreview.com', 'nbcnews.com', 'newsweek.com', 'newyorker.com',
'npr.org', 'nydailynews.com', 'nypost.com', 'nytimes.com', 'pbs.org',
'politico.com', 'propublica.org', 'realclearpolitics.com','reuters.com',
'rollcall.com', 'rt.com', 'salon.com', 'sky.com', 'slate.com',
'sputniknews.com', 'theatlantic.com', 'theguardian.com', 'thehill.com',
'time.com', 'usatoday.com', 'vox.com', 'washingtonpost.com',
'washingtontimes.com', 'weeklystandard.com', 'westernjournal.com', 'wsj.com',
'zerohedge.com',
]
# Non-news sites included as reference points (platforms, universities, NGOs).
non_news_domains = [
'aclu.org', 'change.org', 'cosmopolitan.com', 'facebook.com', 'google.com',
'harvard.edu', 'hbr.org', 'mit.edu', 'patreon.com', 'politifact.com',
'reddit.com', 'reuters.com', 'twitter.com', 'wikileaks.org', 'youtube.com',
]
domains = news_media_domains + non_news_domains
# Horizontal bar chart of each source's rank for the selected domains;
# ranking puts the differently scaled estimates on a common footing.
joined.loc[domains,:].drop_duplicates().rank().plot.barh(figsize=(10, 38))
#_ = sb.pairplot(df.loc[domains,:].rank(), vars=['aaron', 'mc', 'facebook', 'mediabiasfactcheck', 'barbera'])
#aaron_text[aaron_text['new_cleaned_domain'] == 'huffingtonpost.com']
#_ = aaron_text.set_index('new_cleaned_domain').loc[domains,'idea'].dropna().drop_duplicates().sort_values(ascending=False).plot.barh(figsize=(10, 18), legend=False)