Metric: traversal funnels
The number of paths that an article directs toward a cycle or an invalid link.
from collections import defaultdict
import pandas as pd
from scipy import stats
import numpy as np
import json
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Hard-coded, machine-specific locations: `results_path` holds the input JSON,
# `path` is where the figures are written. Adjust both when running elsewhere.
results_path = "/Users/mark/Desktop/wiki_v4/"
path = "/Users/mark/Dropbox/Math/Complex_Systems/research/wikipedia-network/paper/writeup/graphics/"

#load feeder data
# presumably a JSON object mapping article title -> funnel count (TODO confirm)
with open(results_path + "feed_count.json") as f:
    feeder_dict = json.load(f)

# Materialise the items view into a list first: older pandas releases reject a
# bare dict view under Python 3. Passing `columns=` to the constructor also
# keeps an empty input from failing on a later column rename.
feeder_df = pd.DataFrame(list(feeder_dict.items()),
                         columns=['article', 'traversal funnels'])

# Largest funnel counts first; the rank column added later follows row order.
feeder_df = feeder_df.sort_values(by='traversal funnels', ascending=False)

# Quick-look summaries (only rendered when run as individual notebook cells).
feeder_df.head(50)
feeder_df.tail(20)
feeder_df.describe()
feeder_df[feeder_df['traversal funnels'] > 0].count()
sns.boxplot(x='traversal funnels', data=feeder_df[feeder_df['traversal funnels'] > 0])
The extreme outlier is Philosophy.
# Number the rows 1..N in their current order (the frame is expected to be
# sorted by funnel count, descending, at this point), then add base-10 log
# versions of both the rank and the funnel count.
n_articles = len(feeder_df)
feeder_df['rank'] = np.arange(n_articles) + 1
feeder_df['log(rank)'] = np.log10(feeder_df['rank'])
# the +1 keeps zero counts finite: log10(0 + 1) == 0
feeder_df['log(traversal funnels)'] = np.log10(1 + feeder_df['traversal funnels'])
# Horizontal bar chart of the 50 highest-ranked articles. The slice is reversed
# before plotting so that the rank-1 article ends up as the last bar drawn.
top_50 = feeder_df.head(50).iloc[::-1]
top_50.plot(x="article", y="log(traversal funnels)", kind="barh", fontsize=14,
            legend=False, figsize=(6, 16), color="#268bd2")

#no background
ax = plt.gca()
ax.patch.set_visible(False)

# The label now names the plotted metric (it previously read "Traversal Visis").
# A raw string keeps the TeX backslash without relying on an unrecognised
# string escape.
plt.xlabel(r"$\log_{10}$(Traversal Funnels)", fontsize=14)
plt.ylabel("")

# move the value axis ticks and label to the top edge
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')
plt.tick_params(axis='x', which='major', labelsize=14)

#save figure
plt.savefig(path+'top_funnels.png', format='png', dpi=300, bbox_inches='tight')
# Rank-frequency view of every article, using the precomputed log columns.
plt.scatter(feeder_df['log(rank)'], feeder_df['log(traversal funnels)'], color="#87CEFA")
plt.title("Distribution of Traversal Funnels")
plt.xlabel("log(rank)")
plt.ylabel("log(traversal funnels)")
# No legend call here: the scatter carries no label, so asking for a legend
# only produced a "no labelled artists" warning.
# Rows whose base-10 log rank is below 4 (notebook display only; result unused).
feeder_df[feeder_df["log(rank)"] < 4]

# Number of top-ranked rows treated as the "top regime".
# NOTE(review): 8103 ~= e**9, i.e. a natural-log cut-off of 9, while the log
# columns in this frame are base-10 (log10(8103) ~= 3.91, so roughly "< 4").
# The value is kept unchanged so earlier fit results stay reproducible --
# confirm the intended cut-off.
top_regime_rows = 8103
top_regime = feeder_df.iloc[:top_regime_rows]

# Least-squares line through the top regime in log-log space.
slope, intercept, r_value, p_value, std_err = stats.linregress(
    top_regime["log(rank)"], top_regime["log(traversal funnels)"])
# One formatted string runs under both Python 2 and Python 3 and keeps the
# space-separated layout of the former print statement.
print("%s %s %s %s %s" % (slope, intercept, r_value, p_value, std_err))

# The legend text is taken from the fit above rather than typed in by hand.
plt.scatter(top_regime['log(rank)'], top_regime['log(traversal funnels)'],
            color="#87CEFA", label='r = %.2f' % r_value)
# The title uses the base-10 wording of the final figure; it previously
# read "log(rank) < 9".
plt.title(r"Traversal Funnels Top Regime ($\log_{10}$(rank) < 4)")
plt.xlabel("log(rank)")
plt.ylabel("log(traversal funnels)")
plt.legend()
#defaults
sns.set()

# Main panel of the two-part figure: rank-frequency scatter of all articles.
plt.figure(figsize=(8,6))
plt.scatter(feeder_df['log(rank)'], feeder_df['log(traversal funnels)'], color="#87CEFA")
# raw strings keep the TeX backslashes without relying on unrecognised escapes
plt.xlabel(r"$\log_{10}$(rank)", fontsize=14)
plt.ylabel(r"$\log_{10}$(traversal funnels)", fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=14)

# No legend on this panel: nothing plotted here is labelled, so the earlier
# create-then-hide legend calls only produced warnings.
ax = plt.gca()

#define plot axis limits
# hide the first major tick label on each axis
axes = ax
xticks = axes.xaxis.get_major_ticks()
xticks[0].label1.set_visible(False)
yticks = axes.yaxis.get_major_ticks()
yticks[0].label1.set_visible(False)
# style applied to axes created from here on (the inset below)
sns.set_style("dark")

#subplot in top corner
a = plt.axes([.50, .50, .38, .34])
# The `axisbg` keyword no longer exists in current matplotlib; colouring the
# axes background patch directly gives the same yellow fill on old and new
# releases alike.
a.patch.set_facecolor('y')

# Top-ranked rows only, selected by position.
# NOTE(review): 8103 ~= e**9 while the title below states a base-10 cut-off of
# 4 (log10(8103) ~= 3.91) -- confirm the intended cut-off.
inset_rows = feeder_df.iloc[:8103]

# NOTE(review): the figures in this legend text are typed in by hand and are
# not recomputed in this cell -- keep them in sync with the regression output.
fit_label = "\n".join([r"$\alpha$ = -1.08",
                       r"$\gamma$ = 0.07",
                       "Pearson's r = -0.99"])
a.scatter(inset_rows['log(rank)'], inset_rows['log(traversal funnels)'],
          color="#F08080", label=fit_label)
plt.legend(fontsize=14)
plt.title(r"Top Regime ($\log_{10}$(rank) < 4)", fontsize=14)
plt.xlabel(r"$\log_{10}$(rank)", fontsize=14)
plt.ylabel(r"$\log_{10}$(traversal funnels)", fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=14)

#transparent
a.patch.set_alpha(0.1)

#define plot axis limits
# hide the first major tick label on each inset axis
axes = plt.gca()
xticks = axes.xaxis.get_major_ticks()
xticks[0].label1.set_visible(False)
yticks = axes.yaxis.get_major_ticks()
yticks[0].label1.set_visible(False)

#save figure
plt.savefig(path+'funnels_distribution.png', format='png', dpi=300, bbox_inches='tight')

#back to defaults
sns.set()