from collections import defaultdict
import powerlaw
import pandas as pd
from scipy import stats
import numpy as np
import json
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
path = "/Users/mark/Dropbox/Math/Complex_Systems/research/wikipedia-network/paper/writeup/graphics/"
results_path = "/Users/mark/Desktop/wiki_v4/"
#load results into dataframe (~ 1min runtime)
results_path = "/Users/mark/Desktop/wiki_v4/"
with open(results_path + "clicks.json") as f:
dict = json.load(f)
df = pd.DataFrame(dict.items())
df.columns = ['article', 'traversal visits']
df = df.sort_values(by='traversal visits', ascending=False)
df['rank'] = np.arange(1, df.shape[0]+1)
powerlaw_fit_visits = powerlaw.Fit(df['traversal visits'], discrete=True)
print "alpha: ", powerlaw_fit_visits.alpha
print "xmin: ", powerlaw_fit_visits.xmin
print "gamma: ", 1 / powerlaw_fit_visits.alpha + 1
print "Kolmogorov Distance: ", powerlaw_fit_visits.D
print "xmax:", powerlaw_fit_visits.xmax
1/1.72 + 1
plt.figure(figsize=(8,6))
plt.scatter(df["rank"], df["traversal visits"], color="#ff7f0e")
plt.tick_params(axis='both', which='major', labelsize=14)
#change axis labeling to 10^#
axis = plt.gca()
axis.set_yscale('symlog')
axis.set_xscale('log')
#axis limits
axis.set_ylim(bottom=-0.1, top=10**8)
plt.title("Traversal Visits Distribution")
plt.xlabel("Article Rank")
plt.ylabel("Traversal Visits")
figCCDF = powerlaw_fit_visits.plot_pdf(color='#ff7f0e', linewidth=2)
powerlaw_fit_visits.power_law.plot_pdf(color='g', linestyle='--', ax=figCCDF,
label="power law fit\n"+ r'$\alpha$ = 1.7,' + r'$\gamma$ = 1.6')
plt.tick_params(axis='both', which='major', labelsize=14)
#fewer y-axis ticks
yticks = plt.gca().yaxis.get_major_ticks()
for i, y_l in enumerate(yticks):
if i % 2 == 0:
y_l.label1.set_visible(False)
plt.gca().set_xlim(left=1)
plt.gca().set_ylim(top=1)
plt.ylabel("PDF", fontsize=14)
plt.xlabel("Traversal Visits", fontsize=14)
plt.legend(numpoints=2)
#defaults
sns.set()
plt.figure(figsize=(8,6))
plt.figure(figsize=(8,6))
plt.scatter(df["rank"], df["traversal visits"], color="#ff7f0e")
plt.tick_params(axis='both', which='major', labelsize=14)
#change axis labeling to 10^#
axis = plt.gca()
axis.set_yscale('symlog')
axis.set_xscale('log')
#axis limits
axis.set_ylim(bottom=-0.1, top=10**9)
axis.set_xlim(left=0.5, right=10**9)
plt.title("Traversal Visits Distribution", fontsize=14)
plt.xlabel("Article Rank", fontsize=14)
plt.ylabel("Traversal Visits", fontsize=14)
sns.set_style("dark")
#subplot in top corner
a = plt.axes([.50, .53, .38, .34], axisbg='y')
figCCDF = powerlaw_fit_visits.plot_pdf(color='#ff7f0e', linewidth=2)
powerlaw_fit_visits.power_law.plot_pdf(color='g', linestyle='--', ax=figCCDF,
label="power law fit\n"+ r'$\alpha \simeq$ 1.72,' + r'$\gamma \simeq$ 1.58')
plt.tick_params(axis='both', which='major', labelsize=14)
a.set_xlim(left=10**2, right=10**8)
a.set_ylim(top=10**(-2.5))
plt.ylabel("PDF", fontsize=14)
plt.xlabel("Traversal Visits", fontsize=14)
plt.legend(numpoints=2, fontsize=14)
#fewer y-axis ticks
yticks = plt.gca().yaxis.get_major_ticks()
for i, y_l in enumerate(yticks):
if i % 2 == 0:
y_l.label1.set_visible(False)
#transparent
a.patch.set_alpha(0.1)
#reset defaults
sns.set()
#save figure
plt.savefig(path+'traversals_per_article.png', format='png', dpi=300, bbox_inches='tight')
R, p = powerlaw_fit_visits.distribution_compare('power_law', 'lognormal')
print "R: ", R
print "p: ", p
R, p = powerlaw_fit_visits.distribution_compare('power_law', 'exponential')
print "R: ", R
print "p: ", p
R, p = powerlaw_fit_visits.distribution_compare('power_law', 'stretched_exponential')
print "R: ", R
print "p: ", p
R, p = powerlaw_fit_visits.distribution_compare('power_law', 'truncated_power_law')
print "R: ", R
print "p: ", p