Based on the Wikimedia Pageview API: https://wikitech.wikimedia.org/wiki/Analytics/AQS/Pageview_API
import pandas as pd
from scipy import stats
import numpy as np
import json
import requests
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
path = "/Users/mark/Dropbox/Math/Complex_Systems/research/wikipedia-network/paper/writeup/graphics/"
def get_page_views(article):
    """
    Sums the daily page view time series for October 2015 using the Pageview API.
    The article title must use the page's exact capitalization.
    """
    url_start = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/"
    url_end = "/daily/2015100100/2015103100"
    # replace whitespace with underscores, as in canonical article titles
    article = article.replace(" ", "_")
    url = url_start + article + url_end
    page = requests.get(url)
    views_dict = json.loads(page.content)
    views = 0
    try:
        for daily_data in views_dict['items']:
            views += daily_data['views']
    except KeyError:
        # no 'items' key: the API returned no pageview data for this article/range
        pass
    return views
get_page_views("Philosophy")
# load traversal visits and funnels into dataframes (~2 min runtime)
results_path = "/Users/mark/Desktop/wiki_v4/"
with open(results_path + "clicks.json") as f:
visits_dict = json.load(f)
visits_df = pd.DataFrame(visits_dict.items())
visits_df.columns = ['article', 'traversal visits']
visits_df = visits_df.sort_values(by='traversal visits', ascending=False)
with open(results_path + "feed_count.json") as f:
funnels_dict = json.load(f)
funnels_df = pd.DataFrame(funnels_dict.items())
funnels_df.columns = ['article', 'traversal funnels']
funnels_df = funnels_df.sort_values(by='traversal funnels', ascending=False)
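Both frames key on article title, so a quick inner merge (a sketch, not used below) puts visits and funnels side by side for articles appearing in both rankings:
# join the two rankings on article title
visits_funnels_df = visits_df.merge(funnels_df, on='article', how='inner')
visits_funnels_df.head()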
# runtime ~15min
# run once (else load from file below)
# compute views for the articles with the highest traversal visits
visits_top_df = visits_df[:1000].copy()  # .copy() avoids pandas' SettingWithCopyWarning
visits_top_df['page views'] = visits_top_df['article'].map(get_page_views)
#save dataframe to file
temp_store = "/Users/mark/Desktop/wiki_v4/"
visits_top_df.to_json(temp_store + "top_views_and_visits.json")
#read from file
temp_store = "/Users/mark/Desktop/wiki_v4/"
visits_top_df = pd.read_json(temp_store + "top_views_and_visits.json")
#describe dataframe, scatter plot visits versus views
visits_top_df.describe()
visits_top_df.sort_values(by='page views', ascending=False)
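Before plotting, scipy.stats (imported above as stats) gives a quick read on the association; Spearman's rank correlation is the more robust choice given the heavy-tailed counts visible in describe():
# rank correlation is robust to heavy tails; Pearson shown for contrast
rho, p_rho = stats.spearmanr(visits_top_df['traversal visits'], visits_top_df['page views'])
r, p_r = stats.pearsonr(visits_top_df['traversal visits'], visits_top_df['page views'])
print("Spearman rho = %.3f (p = %.2g), Pearson r = %.3f (p = %.2g)" % (rho, p_rho, r, p_r))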
visits_top_df['traversal visits (in thousands)'] = visits_top_df['traversal visits'] / 1000.0
sns.jointplot(x="traversal visits (in thousands)", y="page views", data=visits_top_df, stat_func=None)
visits_top_df['log(traversal visits)'] = np.log10(visits_top_df['traversal visits'])
visits_top_df['log(page views)'] = np.log10(visits_top_df['page views'])
sns.set(font_scale=1.25)
g = sns.jointplot(x="log(traversal visits)", y="log(page views)", data=visits_top_df)
g.set_axis_labels(r"$\log_{10}$(traversal visits)", r"$\log_{10}$(page views)", fontsize=14)
#save figure
plt.savefig(path+'views_visits.png', format='png', dpi=300, bbox_inches='tight')
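The log-log scatter looks roughly linear, so an ordinary least-squares fit sketches a crude power-law exponent (views ~ visits^slope); linregress comes from the scipy.stats import above:
# slope of the log-log fit ~ power-law exponent relating views to visits
fit = stats.linregress(visits_top_df['log(traversal visits)'], visits_top_df['log(page views)'])
print("slope = %.3f, intercept = %.3f, r^2 = %.3f" % (fit.slope, fit.intercept, fit.rvalue ** 2))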
sns.set(font_scale=1.25)
g = sns.jointplot(x="log(traversal visits)", y="log(page views)", data=visits_top_df)
g.ax_joint.annotate("United States", xy=(5.809719, 6.144095))
g.set_axis_labels(r"$\log_{10}$(traversal visits)", r"$\log_{10}$(page views)", fontsize=14)
g = sns.jointplot(x="log(traversal visits)", y="log(page views)", data=visits_top_df, kind="kde", color="g")
g.plot_joint(plt.scatter, c="w", s=30, linewidth=1, marker="+")
sns.jointplot(x="log(traversal visits)", y="log(page views)", data=visits_top_df, kind="kde")
#specific points in log-space
visits_top_df.sort_values(by='log(page views)', ascending=False)
#runtime: ~15min
#alternate way to fetch views for top funnels (without sending too many server requests)
from time import sleep
# compute views for the articles with the highest traversal funnels
funnels_top_df = funnels_df[:1000].copy()
views_list = []
for i, a in enumerate(funnels_top_df['article']):
    try:
        views_list.append(get_page_views(a))
    except Exception:
        try:
            sleep(1)  # brief pause, then retry once
            views_list.append(get_page_views(a))
        except Exception as e:
            print(i, a)
            print(e)
            break
funnels_top_df['page views'] = views_list
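The single-retry loop above is enough here; for heavier use, a small helper with exponential backoff (fetch_with_backoff is a hypothetical name, a sketch rather than part of the analysis) generalizes the idea:
def fetch_with_backoff(article, retries=3, base_delay=1.0):
    """Retry get_page_views, sleeping 1s, 2s, 4s, ... between attempts."""
    for attempt in range(retries):
        try:
            return get_page_views(article)
        except Exception:
            if attempt == retries - 1:
                raise  # out of retries, surface the error
            sleep(base_delay * 2 ** attempt)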
#save to file
temp_store = "/Users/mark/Desktop/wiki_v4/"
funnels_top_df.to_json(temp_store + "top_views_and_funnels.json")
#read funnels and views from file
temp_store = "/Users/mark/Desktop/wiki_v4/"
funnels_top_df = pd.read_json(temp_store + "top_views_and_funnels.json")
funnels_top_df.describe()
funnels_top_df.sort_values(by='page views', ascending=False)
sns.jointplot(x="traversal funnels", y="page views", data=funnels_top_df, stat_func=None)
#save figure
plt.savefig(path+'funnels_visits.png', format='png', dpi=300, bbox_inches='tight')
Excluding Philosophy as an outlier:
sns.jointplot(x="traversal funnels", y="page views", data=funnels_top_df[funnels_top_df['article'] != "Philosophy"], stat_func=None)
plt.text(-3000, 500000, "* excludes Philosophy \n (with 7.37 million funnels)")
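Comparing Spearman's rank correlation with and without the outlier (a sketch reusing the scipy.stats import above) quantifies its influence; rank statistics are generally insensitive to a single extreme value:
# compare rank correlations with and without the Philosophy outlier
no_phil = funnels_top_df[funnels_top_df['article'] != "Philosophy"]
print("all articles:     rho = %.3f" % stats.spearmanr(funnels_top_df['traversal funnels'], funnels_top_df['page views'])[0])
print("excl. Philosophy: rho = %.3f" % stats.spearmanr(no_phil['traversal funnels'], no_phil['page views'])[0])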
# the +1 avoids log10(0) for articles with zero funnels or zero recorded views
funnels_top_df['log(traversal funnels)'] = np.log10(funnels_top_df['traversal funnels'] + 1)
funnels_top_df['log(page views)'] = np.log10(funnels_top_df['page views'] + 1)
sns.set(font_scale=1.25)
g = sns.jointplot(x="log(traversal funnels)", y="log(page views)", data=funnels_top_df)
g.set_axis_labels(r"$\log_{10}$(traversal funnels)", r"$\log_{10}$(page views)", fontsize=14)
#save figure
plt.savefig(path+ 'views_funnels.png', format='png', dpi=300, bbox_inches='tight')
funnels_top_df.sort_values(by="log(page views)", ascending=False).head(10)
funnels_top_df.sort_values(by="log(traversal funnels)", ascending=False).head(10)
funnels_top_df.sort_values(by="log(page views)", ascending=True).head(10)