# ... measured as the accumulated number of visits after traversing the network:
import pandas as pd
from scipy import stats
import numpy as np
import json
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Output directory for the figures saved by this script.
path = "/Users/mark/Dropbox/Math/Complex_Systems/research/wikipedia-network/paper/writeup/graphics/"

# Load results into a dataframe (~ 1min runtime).
results_path = "/Users/mark/Desktop/wiki_v4/"
with open(results_path + "clicks.json") as f:
    # Bound to a descriptive name instead of `dict` so the builtin is not
    # shadowed for the rest of the session.
    visits_by_article = json.load(f)

# One row per (key, value) pair of the JSON object.  list(...) is needed
# because .items() is a view, not a list, on Python 3; passing the column
# names to the constructor also keeps an empty result file from failing on
# a column-count mismatch.
df = pd.DataFrame(list(visits_by_article.items()),
                  columns=['article', 'traversal visits'])

# Most-visited article first; the rank column assigned afterwards relies on
# this row order.
df = df.sort_values(by='traversal visits', ascending=False)
# Derived columns: log10-scaled visits, 1-based rank (rows are already sorted
# by visits, descending), log10-scaled rank, and visits expressed in millions.
visit_counts = df['traversal visits']

# The +1 keeps articles with zero visits finite on the log scale.
df['log(traversal visits)'] = np.log10(visit_counts + 1)
df['rank'] = np.arange(len(df)) + 1
df['log(rank)'] = np.log10(df['rank'])
df['traversal visits (m)'] = visit_counts / 1000000

# Notebook display of the 50 top-ranked rows.
df.head(50)
# Horizontal bar chart of the 50 most-visited articles (visits in millions).
# The rows are reversed before plotting -- presumably so the top-ranked
# article ends up at the top of the chart.
top_fifty = df.head(50).iloc[::-1]
top_fifty.plot(
    x="article",
    y="traversal visits (m)",
    kind="barh",
    fontsize=14,
    legend=False,
    figsize=(6,16),
    color="#268bd2",
)

# No background patch behind the bars.
ax = plt.gca()
ax.patch.set_visible(False)

# Put the x axis (ticks and label) on top of the chart; no y-axis label.
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')
ax.set_xlabel("Traversal Visits \n (in millions)", fontsize=14)
ax.set_ylabel("")
ax.tick_params(axis='x', which='major', labelsize=14)

# Save figure
plt.savefig(path+'articles_ranked.png', format='png', dpi=300, bbox_inches='tight')
# Per-column non-null counts (notebook display): all articles, then articles
# with zero, fewer than 100, and more than 100 traversal visits.
# NOTE(review): an article with exactly 100 visits is in neither of the last
# two buckets -- confirm that is intended.
visit_totals = df['traversal visits']
df.count()
df.loc[visit_totals == 0].count()
df.loc[visit_totals < 100].count()
df.loc[visit_totals > 100].count()
# Histograms of traversal visits (in millions) for the top 10,000, top 1,000
# and top 100 articles.  Only the last two are written to disk.
visits_in_millions = df['traversal visits (m)']

# Top 10,000 (displayed only)
plt.figure(figsize=(8,6))
visits_plot1 = visits_in_millions.head(10000).plot.hist(title="Top 10,000 Articles")
visits_plot1.set(xlabel="traversal visits (millions)", ylabel="frequency")

# Top 1,000
plt.figure(figsize=(8,6))
visits_plot2 = visits_in_millions.head(1000).plot.hist(title="Top 1,000 Articles")
visits_plot2.set(xlabel="traversal visits (millions)", ylabel="frequency")
plt.savefig(path+'top_1k_article_traversals.png', format='png', dpi=300, bbox_inches='tight')

# Top 100
plt.figure(figsize=(8,6))
visits_plot3 = visits_in_millions.head(100).plot.hist(title="Top 100 Articles")
visits_plot3.set(xlabel="traversal visits (millions)", ylabel="frequency")
plt.savefig(path+'top_100_article_traversals.png', format='png', dpi=300, bbox_inches='tight')
# Violin plots of the visit distribution: first on the raw scale, then on the
# log10 scale.  Each one gets its own 8x6 figure.
for column in ('traversal visits', 'log(traversal visits)'):
    plt.figure(figsize=(8,6))
    sns.violinplot(y=column, data=df)
# Traversal visits versus rank, both on linear axes.
rank_fig, rank_ax = plt.subplots(figsize=(8,6))
rank_ax.scatter(df["rank"], df["traversal visits"], color="#6495ED")
rank_ax.set_title("Traversal Visits per Article")
rank_ax.set_xlabel("Article Rank (by traversal visits)")
rank_ax.set_ylabel("Number of Traversal Visits")
# Linear fit of log(rank) vs. log(traversal visits) over all articles.
log_rank = df["log(rank)"]
log_visits = df["log(traversal visits)"]
slope, intercept, r_value, p_value, std_err = stats.linregress(log_rank, log_visits)
# %-formatting prints the same space-separated line on Python 2 and Python 3
# (the bare `print a, b` statement is a syntax error on Python 3).
print("%s %s %s %s %s" % (slope, intercept, r_value, p_value, std_err))

# Figure 1: plain log-log scatter.
# NOTE(review): the numbers in the legend labels below are hard-coded, not
# taken from the regression above -- update them if the data changes.
plt.figure(figsize=(8,6))
plt.scatter(log_rank, log_visits, color="#F08080", label="r-value = -0.93")
plt.title("Traversal Visits Distribution")
plt.xlabel("log(Article Rank)")
plt.ylabel("log(Traversal Visits)")
plt.legend()

# Figure 2: the same scatter with the fitted line.  A new figure is started
# explicitly so that, when this runs as a plain script rather than as separate
# notebook cells, the plot does not draw over figure 1.
plt.figure()
# Raw strings for the TeX parts: "\g" and "\l" are not valid Python escapes.
plt.scatter(log_rank, log_visits, color="#F08080",
            label=r'$\alpha$ = -0.64' + "\n" + r'$\gamma$ = -.56' + "\nPearson's r = -0.93 ")
plt.xlabel(r"$\log_{10}$(Article Rank)", fontsize=14)
plt.ylabel(r"$\log_{10}$(Traversal Visits)", fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=14)  # axis font size
plt.legend(fontsize=14)

# Draw the regression line from the coefficients computed above instead of
# hand-copied values, so the line always matches the data being plotted.
fit_x = np.arange(0, 8)
plt.plot(fit_x, slope * fit_x + intercept)
# Top regime: linear fit restricted to the first 10**5 rows (df is sorted by
# visits, descending, so these are the highest-ranked articles).
# .iloc makes the positional slice explicit.
top_log_rank = df["log(rank)"].iloc[:10**5]
top_log_visits = df["log(traversal visits)"].iloc[:10**5]
slope, intercept, r_value, p_value, std_err = stats.linregress(top_log_rank, top_log_visits)
# %-formatting prints the same space-separated line on Python 2 and Python 3.
print("%s %s %s %s %s" % (slope, intercept, r_value, p_value, std_err))

# Start a new figure so that, when run as a plain script, this plot does not
# draw over whichever figure was current before.
plt.figure()
plt.scatter(top_log_rank, top_log_visits, color="#F08080")
# Raw strings: "\l" is not a valid Python escape sequence.
plt.xlabel(r"$\log_{10}$(Article Rank)", fontsize=14)
plt.ylabel(r"$\log_{10}$(Traversal Visits)", fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=14)  # axis font size
# No plt.legend() here: nothing on this figure carries a label, so the call
# only produced a "no handles" warning.

# Regression line from the coefficients just computed rather than hand-copied
# values, so the line always matches the data being plotted.
fit_x = np.arange(0, 6)
plt.plot(fit_x, slope * fit_x + intercept)
# Bottom regime: linear fit restricted to the rows after the first 10**5
# (df is sorted by visits, descending, so these are the lower-ranked articles).
# .iloc makes the positional slice explicit.
bottom_log_rank = df["log(rank)"].iloc[10**5:]
bottom_log_visits = df["log(traversal visits)"].iloc[10**5:]
slope, intercept, r_value, p_value, std_err = stats.linregress(bottom_log_rank,
                                                               bottom_log_visits)
# %-formatting prints the same space-separated line on Python 2 and Python 3.
print("%s %s %s %s %s" % (slope, intercept, r_value, p_value, std_err))

# Start a new figure so that, when run as a plain script, this plot does not
# draw over whichever figure was current before.
plt.figure()
plt.scatter(bottom_log_rank, bottom_log_visits, color="#F08080")
# Raw strings: "\l" is not a valid Python escape sequence.
plt.xlabel(r"$\log_{10}$(Article Rank)", fontsize=14)
plt.ylabel(r"$\log_{10}$(Traversal Visits)", fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=14)  # axis font size
# No plt.legend() here: nothing on this figure carries a label, so the call
# only produced a "no handles" warning.

# Regression line from the coefficients just computed rather than hand-copied
# values, so the line always matches the data being plotted.
fit_x = np.arange(5, 8)
plt.plot(fit_x, slope * fit_x + intercept)
# Combined figure: log-log scatter of all articles with one fitted line per
# regime, saved as traversals_per_article.png.

# Hard-coded line coefficients and legend values for the two regimes.
# NOTE(review): presumably copied from linregress runs on the rows before and
# after position 10**5 -- confirm, and update together if the data changes.
TOP_SLOPE, TOP_INTERCEPT = -1.22827104244, 7.45682294098
BOTTOM_SLOPE, BOTTOM_INTERCEPT = -0.57896217612, 3.96006843729

# Start a new figure so that, when run as a plain script, the saved image
# contains only the artists drawn below and nothing left over from earlier plots.
plt.figure()
plt.scatter(df["log(rank)"], df["log(traversal visits)"], color="#F08080",
            label=None)
# Raw strings for the TeX parts: "\l" and "\g" are not valid Python escapes.
plt.xlabel(r"$\log_{10}$(Article Rank)", fontsize=14)
plt.ylabel(r"$\log_{10}$(Traversal Visits)", fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=14)  # axis font size

# Top regime
top_x = np.arange(0, 6)
plt.plot(top_x, TOP_SLOPE * top_x + TOP_INTERCEPT,
         label=r'$\alpha$ = -1.23' + "\n" + r'$\gamma$ = 0.187' + "\nPearson's r = -0.99")

# Bottom regime (the leading newline in the label separates the two legend entries)
bottom_x = np.arange(5, 8)
plt.plot(bottom_x, BOTTOM_SLOPE * bottom_x + BOTTOM_INTERCEPT,
         label="\n" + r'$\alpha$ = -0.579' + "\n" + r'$\gamma$ = -0.727' + "\nPearson's r = -0.93")

# One legend call after both lines exist; an earlier call would only be
# replaced by this one.
plt.legend(fontsize=14)

# Hide the label of the first major tick on each axis.
axes = plt.gca()
xticks = axes.xaxis.get_major_ticks()
xticks[0].label1.set_visible(False)
yticks = axes.yaxis.get_major_ticks()
yticks[0].label1.set_visible(False)

# Save figure
plt.savefig(path+'traversals_per_article.png', format='png', dpi=300, bbox_inches='tight')
# Histogram of the raw traversal visits (no KDE) with a distribution fitted
# through scipy's `stats.powerlaw` drawn on top.
# NOTE(review): scipy.stats.powerlaw is the power-function distribution on a
# bounded interval, not a heavy-tailed (Pareto-type) law -- confirm this is
# the intended fit.
plt.figure(figsize=(8,6))
raw_visits = df["traversal visits"]
sns.distplot(raw_visits, kde=False, fit=stats.powerlaw)
# Summary statistics of the traversal-visit counts (notebook display).
df['traversal visits'].describe()

# The 50 least-visited articles (ascending sort).  Uses sort_values, like the
# descending sort applied when the frame is built; DataFrame.sort(columns=...)
# no longer exists in current pandas.
df.sort_values(by='traversal visits').head(50)