import pickle
import tldextract, joypy
import pandas as pd
import seaborn as sns
%matplotlib inline
sns.set()
Let's take a look at what NewsGuard has for data. A CSV of NewsGuard data joined with Facebook ideology scores and Media Cloud election retweeter scores is available here.
They didn't have any policies I could find against scraping data, so I wrote a quick scraper that tried to be nice about it. I couldn't find a way to get them to list the sites they had rated, so I put together a list of ~22,700 unique domains we had seen in some of our research. I then queried their system for each of those domains and stored all the data.
Their data comes back as a fairly detailed, multi-level JSON document. Here's an example document:
# Load the scraped NewsGuard ratings (domain -> rating document).
# NOTE(review): unpickling is only acceptable here because this file was
# produced by our own scraper; never unpickle untrusted input.
# Use a context manager so the file handle is closed deterministically
# (the original open() leaked the handle).
with open('scrape_news_domains_20180827T14:08:00Z-0400.pkl', 'rb') as f:
    ratings = pickle.load(f)
# Example of the multi-level JSON-derived rating document for one domain.
ratings['nytimes.com']
Observations
- createdDate and updatedDate are empty.
- The interesting top-level fields are criteria, metadata, rank, and score.
- Political leaning appears in the orientation metadata field.
- The criteria field contains the attributes they judge on.
- The rank field contains their judgement.
I looked at sample pages for each unique rank code. Here are the messages a user receives for each rank:
# Human-readable message NewsGuard shows a user for each rank code.
rank_info = dict(
    TK='This website is still in the process of being rated by NewsGuard.',
    T='This website generally maintains basic standards of accuracy and accountability.',
    N='Proceed with caution: This website generally fails to maintain basic standards of accuracy and accountability.',
    P='This website publishes content from its users that it does not vet. Information from this source may not be reliable.',
    S='This is a satire or humor website. It is not an actual news source.',
)
They obviously didn't return ratings for all the domains I queried. Let's see how many ratings I actually retrieved, not including ratings of rank TK.
# Domains that came back with a real rating id and are no longer pending ('TK').
have_ratings = [
    domain
    for domain, rating in ratings.items()
    if rating['id'] is not None and rating['rank'] != 'TK'
]
len(have_ratings)
# Index the ratings two ways: queried site -> rating id, and rating id ->
# rating document (several site spellings can share one canonical rating).
site_to_id = {
    site: rating['id'] for site, rating in ratings.items() if rating['id'] is not None
}
ratings_by_id = {
    rating['id']: rating for rating in ratings.values() if rating['id'] is not None
}
# Normalize every queried site to its lowercased registered domain and attach
# the canonical rating document.
domain_to_rating = {
    tldextract.extract(site).registered_domain.lower(): ratings_by_id[rating_id]
    for site, rating_id in site_to_id.items()
}
def get_subkey(rating, key, title):
    """Return the 'body' of the first entry under rating[key] whose 'title'
    equals *title*, or None when the section is missing or has no match."""
    entries = rating[key]
    if entries is None:
        return None
    for entry in entries:
        if entry['title'] == title:
            return entry['body']
    return None
def get_metadata(rating, title):
    """Convenience wrapper: look *title* up in the rating's 'metadata' section."""
    return get_subkey(rating, key='metadata', title=title)
def get_criteria(rating, title):
    """Convenience wrapper: look *title* up in the rating's 'criteria' section."""
    return get_subkey(rating, key='criteria', title=title)
The orientation data seems interesting. Let's see what that looks like.
# NewsGuard 'orientation' label for every rated domain.
orientations = {
    domain: get_metadata(rating, 'orientation')
    for domain, rating in domain_to_rating.items()
}
ng_orientations = pd.Series(orientations, name='ng_orientation')
ng_orientations.value_counts()
# Peek at a few example domains per orientation label.
ng_orientations.groupby(ng_orientations).apply(lambda g: g.sample(5))
Observations
- Most sites have an orientation of N/A.
- Very few sites are labeled Far Left.
How do these orientations correlate with election retweeters and Facebook ideologies?
# Media Cloud election-retweeter polarization scores, keyed by registered domain.
mc_scores = pd.read_csv('election_retweeter_polarization_media_scores.csv')

def _url_to_domain(url):
    # Same normalization used for the NewsGuard data: lowercased registered domain.
    return tldextract.extract(url).registered_domain.lower()

mc_scores['domain'] = mc_scores['url'].apply(_url_to_domain)
mc_scores = mc_scores.set_index('domain')
# Left-to-right ordering used for every orientation plot below.
orientation_order = ['Far Left', 'Slightly Left', 'N/A', 'Slightly Right', 'Far Right']
joined = mc_scores.join(ng_orientations).dropna()
_ = sns.catplot(y='ng_orientation', x='score',
                data=joined, order=orientation_order)
Observations
- N/A has a wide range of scores.
How about the Facebook data?
# Facebook average-alignment ideology estimates, keyed by registered domain.
facebook = pd.read_csv('facebook_ideology_estimates.csv')
facebook['domain'] = facebook['domain'].apply(lambda d: tldextract.extract(d).registered_domain.lower())
facebook = facebook.set_index('domain')
# Reuse the shared orientation_order variable instead of repeating the list
# literal, so plot ordering stays consistent with the Media Cloud plot above.
_ = sns.catplot(x='avg_align', y='ng_orientation',
                order=orientation_order,
                data=facebook.join(ng_orientations).dropna())
Observations
- N/A skews left.
- Slightly Left and Slightly Right look to extend about as far to the edges as their "Far" counterparts, but just have more variance.
Let's build a flat dataset of interesting fields so things are a bit easier to look at. Then let's look at rank, score, and criteria.
Criteria is documented here.
# Criteria attributes to pull from each rating, in the column order we want.
CRITERIA_FIELDS = (
    'falseContent',
    'basicStandards',
    'newsOpinion',
    'deceptiveHeadlines',
    'accountabilityPractices',
    'ownership',
    'labelsAdvertising',
    'management',
    'contentCreators',
)

# Flatten each non-pending rating into one row of simple scalar fields.
domains = {}
for domain, rating in domain_to_rating.items():
    # Skip domains NewsGuard never rated, or whose rating is still pending.
    if rating['id'] is None or rating['rank'] == 'TK':
        continue
    row = {
        'rank': rating['rank'],
        'orientation': get_metadata(rating, 'orientation'),
        'score': rating['score'],
    }
    for field in CRITERIA_FIELDS:
        row[field] = get_criteria(rating, field)
    row['active'] = rating['active']
    domains[domain] = row

ng = pd.DataFrame.from_dict(domains, orient='index')
print(ng.shape)
ng['rank'].value_counts()
# Scores for sites rated 'N', lowest first.
ng[ng['rank'] == 'N']['score'].sort_values()
# NOTE(review): sns.distplot is deprecated in newer seaborn releases;
# histplot is the modern equivalent — confirm before upgrading seaborn.
_ = sns.distplot(ng['score'], kde=False)
Observations
# Criteria columns to summarize (NewsGuard documents these on its site).
criteria = [
    "falseContent",
    "basicStandards",
    "newsOpinion",
    "accountabilityPractices",
    "deceptiveHeadlines",
    "ownership",
    "labelsAdvertising",
    "management",
    "contentCreators",
]
# Yes/No counts per criterion — computed once and reused for the plot
# (the original evaluated the same apply() twice).
criteria_counts = ng.loc[:, criteria].apply(lambda c: c.value_counts())
criteria_counts
_ = criteria_counts.loc['No', ].sort_values().plot.barh(title='# Sites Failing each Criteria')
Observations
I'll look at scores by NewsGuard assigned orientation first, and then switch over to the Facebook numbers to get something more fine-grained.
# Keep only definitive ratings: trusted ('T') or not trusted ('N').
ng = ng[ng['rank'].isin(['T', 'N'])]
ng.groupby('orientation')['rank'].value_counts()
_ = sns.stripplot(y='orientation', x='score', data=ng, order=orientation_order)
_ = sns.boxplot(y='orientation', x='score', data=ng, order=orientation_order)
Observations
- Slightly Left has the fewest outliers with low scores.
- Far Right has more of its mass in the lower scores. In fact, the median is actually failing.
Let's look at Facebook numbers.
import plotly.offline as plotly
import plotly.graph_objs as go
plotly.init_notebook_mode()

# Join once instead of recomputing ng.join(facebook) for every trace field
# (the original built the same joined frame four times).
ng_fb = ng.join(facebook)
scatter1 = go.Scattergl(
    y=ng_fb['score'],
    x=ng_fb['avg_align'],
    mode='markers',
    text=ng_fb.index,
    marker=dict(
        # Color by factorized rank code.
        # NOTE(review): the two-stop colorscale assumes exactly two rank codes
        # remain after the earlier T/N filter — confirm if that filter changes.
        color=ng_fb['rank'].factorize()[0],
        colorscale=[[0, 'rgb(200,20,20)'], [1, '#398937']]
    )
)
layout1 = go.Layout(
    title ='NewsGuard Scores vs Source Ideology',
    hovermode = 'closest',
    xaxis = dict(title = 'Ideology by Facebook Average Alignment'),
    yaxis = dict(title = 'NewsGuard Score'),
)
plotly.iplot(go.Figure(data=[scatter1], layout=layout1))
Observations
# Persist the combined NewsGuard + Facebook-ideology + Media Cloud dataset.
combined = ng.join(facebook).join(mc_scores, rsuffix='_mc')
combined.to_csv('newsguard_facebook-ideo_election-retweet_joined.csv')