import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Output directory for the figures saved by the plotting code in this file.
path = "/Users/mark/Dropbox/Math/Complex_Systems/research/wikipedia-network/paper/writeup/graphics/"

# Load the results into a dataframe (~1 min runtime).
results_path = "/Users/mark/Desktop/wiki_v4/"
with open(results_path + "lengths.json") as f:
    # Bound to a non-builtin name so that dict() stays usable afterwards.
    lengths_by_page = json.load(f)

# One row per (key, value) pair of the loaded mapping. The items are
# materialised as a list because a Python 3 items() view is not a sequence;
# passing columns= here also keeps an empty mapping from failing on rename.
df = pd.DataFrame(list(lengths_by_page.items()), columns=['page', 'path length'])
# path length: the number of first-link traversals made before reaching a repeated article or an invalid link
# Rank all pages by path length, longest first. DataFrame.sort(columns=...)
# was removed from pandas; sort_values(by=...) is its replacement. The frame
# is sorted once and the result reused for both views below.
by_length_desc = df.sort_values(by='path length', ascending=False)

# The 50 longest paths.
by_length_desc.head(50)

# The 1000 longest paths, then the same rows minus those whose 'page' value
# contains "liturgics" (case-insensitive).
top1k = by_length_desc.head(1000)
top1k[top1k['page'].apply(lambda page: "liturgics" not in page.lower())]

# Sum of all path lengths.
df['path length'].sum()

# Rows with a path length of 0: per-column count, then the first ten.
df[df['path length'] == 0].count()
df[df['path length'] == 0].head(10)
# Includes invalid links.
# Per-column count of the rows whose path length is exactly 2.
df.loc[df['path length'].eq(2)].count()
# Includes invalid links.
# Per-column count of the rows whose path length is exactly 3.
df.loc[df['path length'].eq(3)].count()

# Most frequent path length(s).
df['path length'].mode()

# Per-column count of the rows whose path length is exactly 29.
df.loc[df['path length'].eq(29)].count()

# Summary statistics of the dataframe.
df.describe()
# 75% of pages traverse fewer than 30 first links!
# Box plot of all path lengths on an 8x6 figure, saved as a 300 dpi PNG
# inside the graphics directory held in `path`.
plt.figure(figsize=(8, 6))
sns.set_style("whitegrid")
sns.boxplot(x=df["path length"])
plt.savefig(
    path + 'path_lengths_boxplot.png',
    format='png',
    dpi=300,
    bbox_inches='tight'
)
# Distribution plot of all path lengths on a fresh 8x6 figure.
plt.figure(figsize=(8, 6))
sns.distplot(a=df["path length"])
# Kernel density estimate of the raw path lengths (runtime ~3 min).
plt.figure(figsize=(8, 6))

# The white-grid style only has to be active while the axes are created.
with sns.axes_style("whitegrid"):
    sns.kdeplot(df["path length"], shade=True, legend=False, color='g')
    sns.despine(left=True)

# Axis titles.
plt.xlabel("path length", fontsize=14)
plt.ylabel("density", fontsize=14)

# Larger tick labels on both axes.
plt.tick_params(axis='both', which='major', labelsize=14)
# Kernel density estimate of the log-scaled path lengths (runtime ~3 min).
plt.figure(figsize=(8, 6))

# The white-grid style only has to be active while the axes are created.
with sns.axes_style("whitegrid"):
    # The +1 maps a path length of 0 to log10(1) = 0 instead of -inf.
    # NOTE(review): recent seaborn releases deprecate shade= in favour of
    # fill= -- switch once the minimum supported seaborn version allows it.
    sns.kdeplot(np.log10(df["path length"] + 1), shade=True, legend=False, color='g')
    sns.despine(left=True)

# Larger tick labels on both axes.
plt.tick_params(axis='both', which='major', labelsize=14)

# Axis titles. The raw string keeps the backslash of \log from being read as
# an (invalid) string escape; the rendered label text is unchanged.
# NOTE(review): the plotted quantity is log10(path length + 1), while the
# label says log10(path length).
plt.xlabel(r"$\log_{10}$(path length)", fontsize=14)
plt.ylabel("density", fontsize=14)
def _hide_first_tick_labels(ax):
    """Hide the first major tick label on both the x and the y axis of *ax*."""
    for axis in (ax.xaxis, ax.yaxis):
        axis.get_major_ticks()[0].label1.set_visible(False)


# Combined figure: KDE of the raw path lengths, with the KDE of the
# log-scaled path lengths as an inset in the top-right corner.

# Start from seaborn's default settings.
sns.set()
plt.figure(figsize=(8, 6))

# --- main panel: raw path lengths ---
with sns.axes_style("whitegrid"):
    # NOTE(review): recent seaborn releases deprecate shade= in favour of
    # fill= -- switch once the minimum supported seaborn version allows it.
    sns.kdeplot(df["path length"], shade=True, legend=False, color='g')
    sns.despine(left=True)

# Larger tick labels, then the axis titles.
plt.tick_params(axis='both', which='major', labelsize=14)
plt.xlabel("path length", fontsize=14)
plt.ylabel("density", fontsize=14)
_hide_first_tick_labels(plt.gca())

# --- inset panel: log10(path length + 1) ---
sns.set_style("dark")

# Inset axes in the top corner, given as [left, bottom, width, height] in
# figure fractions. `axisbg` was removed from matplotlib; `facecolor` is the
# replacement keyword for the axes background colour.
a = plt.axes([.53, .51, .37, .37], facecolor='y')
with sns.axes_style("whitegrid"):
    # The +1 maps a path length of 0 to log10(1) = 0 instead of -inf.
    sns.kdeplot(np.log10(df["path length"] + 1), shade=True, legend=False, color='g')
    sns.despine(left=True)

# Larger tick labels, then the axis titles. The raw string keeps the backslash
# of \log from being read as an (invalid) string escape; the label text is
# unchanged.
plt.tick_params(axis='both', which='major', labelsize=14)
plt.xlabel(r"$\log_{10}$(path length)", fontsize=14)
plt.ylabel("density", fontsize=14)
_hide_first_tick_labels(plt.gca())

# Make the inset background almost transparent.
a.patch.set_alpha(0.1)

# Back to seaborn defaults for whatever is plotted next.
sns.set()

# Save the figure as a 300 dpi PNG in the graphics directory.
plt.savefig(path+'path_lengths_dist.png', format='png', dpi=300, bbox_inches='tight')
# Path lengths of the pages with the longest paths, longest first.
# DataFrame.sort(columns=...) was removed from pandas; sort_values(by=...) is
# its replacement. The frame is sorted once and sliced three times instead of
# being fully re-sorted for every slice.
lengths_desc = df.sort_values(by='path length', ascending=False)['path length']
toppdf_10k = lengths_desc.head(10000)
toppdf_1k = lengths_desc.head(1000)
toppdf_100 = lengths_desc.head(100)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
def _longest_paths_hist(lengths, top_label):
    """Draw a histogram of *lengths* on a new figure and return its axes.

    lengths: pandas Series of path lengths to bin.
    top_label: text inserted into the plot title, e.g. "10k".
    """
    # A fresh figure per histogram: Series.plot draws on the current axes, so
    # without it consecutive calls would pile up on one plot.
    plt.figure()
    ax = lengths.plot(kind='hist', title="Pages with Longest Paths (top {})".format(top_label))
    ax.set_xlabel("path length")
    ax.set_ylabel("frequency")
    return ax


# Histograms of the path lengths of the 10k / 1k / 100 highest-ranked pages.
visits_plot1 = _longest_paths_hist(toppdf_10k, "10k")
visits_plot2 = _longest_paths_hist(toppdf_1k, "1k")
visits_plot3 = _longest_paths_hist(toppdf_100, "100")