Traversal Visits¶

Traversal visits are measured as the accumulated number of visits each article receives after traversing the network:

begin at every article
first link receives +1 visit
continue to next first link, +1 visit
- stop at a repeated article or invalid link

import pandas as pd
from scipy import stats 
import numpy as np
import json

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Directory that the figures saved further down are written to.
path = "/Users/mark/Dropbox/Math/Complex_Systems/research/wikipedia-network/paper/writeup/graphics/"

# Load results into a dataframe (~1 min runtime).
# clicks.json is parsed as a JSON object and each key/value pair becomes one
# row (presumably article title -> traversal-visit count, going by the
# column names assigned below).
results_path = "/Users/mark/Desktop/wiki_v4/"
with open(results_path + "clicks.json") as f:
    # Deliberately not called `dict`, so the builtin stays usable afterwards.
    visits_by_article = json.load(f)

# list(...) keeps the constructor input a concrete sequence even when
# items() returns a view instead of a list.
df = pd.DataFrame(list(visits_by_article.items()),
                  columns=['article', 'traversal visits'])

# Most-visited articles first.
df = df.sort_values(by='traversal visits', ascending=False)

# Derived columns: log-scaled visits, 1-based rank, log-scaled rank, and
# visits expressed in millions.
visit_counts = df['traversal visits']
n_articles = len(df)

# The +1 shift maps a count of zero to log10(1) = 0.
df['log(traversal visits)'] = np.log10(visit_counts + 1)
# Rank follows the current row order of df: 1 for the first row, 2 for the next, ...
df['rank'] = np.arange(1, n_articles + 1)
df['log(rank)'] = np.log10(df['rank'])

df['traversal visits (m)'] = visit_counts / 1000000

How many articles are there?¶

# Number of non-null entries in each column of df.
df.count()

article             11277534
traversal visits    11277534
dtype: int64

How many articles have 0 traversal visits?¶

# Per-column count restricted to rows whose traversal-visit total is exactly 0.
never_visited = df['traversal visits'] == 0
df[never_visited].count()

page      8567636
visits    8567636
dtype: int64

How many articles have fewer than 100 traversal visits?¶

# Per-column count restricted to rows with strictly fewer than 100 traversal visits.
under_100 = df['traversal visits'] < 100
df[under_100].count()

article             11251449
traversal visits    11251449
dtype: int64

How many articles have more than 100 traversal visits?¶

# Per-column count restricted to rows with strictly more than 100 traversal visits.
over_100 = df['traversal visits'] > 100
df[over_100].count()

page      25845
visits    25845
dtype: int64

What's the distribution of traversal visits?¶

# Histogram of traversal visits (in millions) over the first 10,000 rows of df.
top_10k_visits_m = df['traversal visits (m)'].head(10000)

plt.figure(figsize=(8,6))
visits_plot1 = top_10k_visits_m.plot(kind='hist', title="Top 10,000 Articles")
visits_plot1.set_xlabel("traversal visits (millions)")
visits_plot1.set_ylabel("frequency")

<matplotlib.text.Text at 0x13e623c10>

# Histogram of traversal visits (in millions) over the first 1,000 rows of df.
top_1k_visits_m = df['traversal visits (m)'].head(1000)

plt.figure(figsize=(8,6))
visits_plot2 = top_1k_visits_m.plot(kind='hist', title="Top 1,000 Articles")
visits_plot2.set_xlabel("traversal visits (millions)")
visits_plot2.set_ylabel("frequency")

# Write the figure to the graphics directory as a 300-dpi PNG.
plt.savefig(path+'top_1k_article_traversals.png', format='png', dpi=300, bbox_inches='tight')

# Histogram of traversal visits (in millions) over the first 100 rows of df.
top_100_visits_m = df['traversal visits (m)'].head(100)

plt.figure(figsize=(8,6))
visits_plot3 = top_100_visits_m.plot(kind='hist', title="Top 100 Articles")
visits_plot3.set_xlabel("traversal visits (millions)")
visits_plot3.set_ylabel("frequency")

# Write the figure to the graphics directory as a 300-dpi PNG.
plt.savefig(path+'top_100_article_traversals.png', format='png', dpi=300, bbox_inches='tight')

all articles¶

# Violin plot of the raw 'traversal visits' column across every row of df.
plt.figure(figsize=(8,6))
sns.violinplot(y='traversal visits', data=df)

<matplotlib.axes._subplots.AxesSubplot at 0x20cba3b90>

# Violin plot of the log-scaled column, log10(traversal visits + 1),
# across every row of df.
plt.figure(figsize=(8,6))

sns.violinplot(y='log(traversal visits)', data=df)

<matplotlib.axes._subplots.AxesSubplot at 0x10f2f7050>

/Users/mark/Envs/wiki-py2/lib/python2.7/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

# Traversal visits versus rank on linear axes.
rank_values = df["rank"]
visit_counts = df["traversal visits"]

plt.figure(figsize=(8,6))
plt.scatter(rank_values, visit_counts, color="#6495ED")

plt.ylabel("Number of Traversal Visits")
plt.title("Traversal Visits per Article")
plt.xlabel("Article Rank (by traversal visits)")

<matplotlib.text.Text at 0x20bae16d0>

On a Logarithmic Scale¶

#linear fit of log(rank) vs. log(traveral visits)
slope, intercept, r_value, p_value, std_err = stats.linregress(df["log(rank)"], df["log(traversal visits)"])
print slope, intercept, r_value, p_value, std_err

-0.636060645731 4.34266050449 -0.930719619472 0.0 7.44281732628e-05

Power-law exponent: -0.636¶

# Log-log scatter of every article with the full-data fit line on top.
plt.figure(figsize=(8,6))

# First pass: points labelled with the correlation coefficient only.
plt.scatter(df["log(rank)"], df["log(traversal visits)"],  color="#F08080", label="r-value = -0.93")

plt.title("Traversal Visits Distribution")
plt.xlabel("log(Article Rank)")
plt.ylabel("log(Traversal Visits)")
plt.legend()

# Second pass over the same points with the full annotation. The axis labels
# and legend set below are applied after, and so replace, the ones set above.
# Mathtext fragments are raw strings so that backslashes such as \gamma and
# \log are never read as string escapes; the rendered text is unchanged.
plt.scatter(df["log(rank)"], df["log(traversal visits)"], color="#F08080",
            label=r'$\alpha$ = -0.64' + "\n" + r'$\gamma$ = -.56' + "\nPearson's r = -0.93 ")
plt.xlabel(r"$\log_{10}$(Article Rank)", fontsize=14)
plt.ylabel(r"$\log_{10}$(Traversal Visits)", fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=14) #axis font size
plt.legend(fontsize=14)

# Fit line; slope and intercept are the values printed by the full-data regression.
plt.plot(range(0, 8), [x*-0.636060 + 4.34266050449 for x in range(0, 8)])

[<matplotlib.lines.Line2D at 0x138710850>]

Top Regime¶

slope, intercept, r_value, p_value, std_err = stats.linregress(df["log(rank)"][:10**5], df["log(traversal visits)"][:10**5])
print slope, intercept, r_value, p_value, std_err

-1.22827104244 7.45682294098 -0.998440306922 0.0 0.000217190894288

# Top regime: the first 10**5 rows on log-log axes, with the regime's fit line.
top_n = 10**5
plt.scatter(df["log(rank)"][:top_n], df["log(traversal visits)"][:top_n], color="#F08080")
# Raw strings keep the mathtext backslashes from being read as string escapes.
plt.xlabel(r"$\log_{10}$(Article Rank)", fontsize=14)
plt.ylabel(r"$\log_{10}$(Traversal Visits)", fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=14) #axis font size
# No plt.legend() here: neither the scatter nor the fit line carries a label.

# Fit line; slope and intercept are the values printed by the top-regime regression.
plt.plot(range(0, 6), [x*-1.22827104244 + 7.45682294098 for x in range(0, 6)])

[<matplotlib.lines.Line2D at 0x1304f4190>]

Bottom Regime¶

slope, intercept, r_value, p_value, std_err = stats.linregress(df["log(rank)"][10**5:], 
                                                               df["log(traversal visits)"][10**5:])
print slope, intercept, r_value, p_value, std_err

-0.57896217612 3.96006843729 -0.919092356404 0.0 7.42439023587e-05

# Bottom regime: every row after the first 10**5 on log-log axes, with the
# regime's fit line.
regime_split = 10**5
plt.scatter(df["log(rank)"][regime_split:], df["log(traversal visits)"][regime_split:], color="#F08080")
# Raw strings keep the mathtext backslashes from being read as string escapes.
plt.xlabel(r"$\log_{10}$(Article Rank)", fontsize=14)
plt.ylabel(r"$\log_{10}$(Traversal Visits)", fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=14) #axis font size
# No plt.legend() here: neither the scatter nor the fit line carries a label.

# Fit line; slope and intercept are the values printed by the bottom-regime regression.
plt.plot(range(5, 8), [x*-0.57896217612 + 3.96006843729 for x in range(5, 8)])

[<matplotlib.lines.Line2D at 0x13d58a510>]

combined plot with regimes¶

# Combined log-log figure: all articles plus the fit line of each regime.
# The scatter is left unlabelled so that only the two fit lines appear in the legend.
plt.scatter(df["log(rank)"], df["log(traversal visits)"], color="#F08080",
            label=None)
# Raw strings keep the mathtext backslashes from being read as string escapes.
plt.xlabel(r"$\log_{10}$(Article Rank)", fontsize=14)
plt.ylabel(r"$\log_{10}$(Traversal Visits)", fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=14) #axis font size

#top regime (slope/intercept as printed by the regression on the first 10**5 rows)
plt.plot(range(0, 6), [x*-1.22827104244 + 7.45682294098 for x in range(0, 6)],
         label=r'$\alpha$ = -1.23' + "\n" + r'$\gamma$ = 0.187' + "\nPearson's r = -0.99")

#bottom regime (slope/intercept as printed by the regression on the remaining rows)
# The leading "\n" puts a blank line between the two legend entries.
# NOTE(review): the bottom-regime regression printed r = -0.919, while this
# label shows -0.93 (the value of the full-data fit) -- confirm which is intended.
plt.plot(range(5, 8), [x*-0.57896217612 + 3.96006843729 for x in range(5, 8)],
         label="\n" + r'$\alpha$ = -0.579' + "\n" + r'$\gamma$ = -0.727' + "\nPearson's r = -0.93")

# Build the legend once, after both labelled lines have been drawn.
plt.legend(fontsize=14)

#hide the first major tick label on each axis
axes = plt.gca()
xticks = axes.xaxis.get_major_ticks()
xticks[0].label1.set_visible(False)
yticks = axes.yaxis.get_major_ticks()
yticks[0].label1.set_visible(False)

#save figure
plt.savefig(path+'traversals_per_article.png', format='png', dpi=300, bbox_inches='tight')

Fit to a Power Law¶

For full analysis of Power Law Fit see "power_law_visits.ipynb"

# Histogram of the raw traversal-visit counts (KDE disabled) with
# scipy.stats.powerlaw passed as the distribution for seaborn to fit.
# NOTE(review): scipy.stats.powerlaw is the power-function distribution
# (pdf a*x**(a-1) on a bounded interval) -- confirm this is the intended model.
plt.figure(figsize=(8,6))

sns.distplot(df["traversal visits"], kde=False, fit=stats.powerlaw)

<matplotlib.axes._subplots.AxesSubplot at 0x115541810>

Descriptive Statistics¶

# Summary statistics (count, mean, std, min, quartiles, max) of the raw counts.
df['traversal visits'].describe()

count    11277534.000000
mean           20.603523
std          9501.095584
min             0.000000
25%             0.000000
50%             0.000000
75%             0.000000
max       7400884.000000
Name: traversal visits, dtype: float64

A sample of articles with 0 traversal visits¶

# Ascending sort puts the least-visited articles first, so head(50) shows a
# sample of the articles with the fewest traversal visits.
# sort_values is the same call used when df was first ordered; DataFrame.sort
# is the deprecated spelling and is absent from current pandas.
df.sort_values(by='traversal visits').head(50)

	article	traversal visits
1031718	Existence	7400884
3381363	Quality (philosophy)	7400884
4495967	Consciousness	7400884
1020904	Conscious	7400884
973077	Awareness	7400884
7948850	Philosophy	7400884
7974918	Reality	7400884
4042014	Modern philosophy	7269259
1584377	Property (philosophy)	7269255
7062295	Quantity	7259791
6069882	Mathematics	7255122
2860219	Set (mathematics)	7079148
7332038	Explanation	6622934
10976731	Hypothesis	6616886
5125755	Experiment	6616203
3512481	Experience	6614083
9387432	Fact	6613815
4514332	Knowledge	6613248
9531996	Science	6444926
6731720	Natural science	4633960
9081904	Biology	3400690
4579566	Tribe (biology)	2028827
5133222	Hominini	2027606
7799954	Human	2027495
5997652	World	1509110
8371182	Community	1412646
5587156	State (polity)	1401610
4178551	Earth	1335477
462809	Geography	1297144
4911286	Organism	993743
1101605	Human geography	910619
9429023	Political geography	910554
501901	Country	910528
2579735	Physics	905296
10256925	Political union	858725
7160526	Federation	858699
2193	Social science	835526
6063250	Federal republic	723998
7768454	United States	645237
6527028	Administrative division	466333
7830566	Data	455691
4507680	Information	425296
2909326	Power (social and political)	417078
2696928	Biological interaction	413946
3386346	Competition (biology)	413936
4513068	Competition	413358
6308195	Sport	407781
2147984	Landmass	404722
5220083	Continent	404716
1513596	Communication	390890

	article	traversal visits
5638766	Topspin 3	0
9645400	Marco Zwyssig	0
5840661	Bobby Webb	0
5840659	The Lord of the Nazgûl	0
5840658	Mansuri, Bafq	0
5840657	Nyctemera fasciata	0
5840656	Kiss of Life (film)	0
9645402	Darreh-ye Gachi Ab Deyfeh	0
5840654	Billy Lang	0
5840653	911 gt3	0
9645403	Tony Clownarelli	0
5840651	James Molloy	0
9645405	Hitlerjunge Quex	0
5840649	Andrea Barberi	0
5840648	1961-62 nhl season	0
5840647	Jurassic Park (film series)	0
9645406	Roberto cabot	0
5840645	Al Najjada	0
9645407	2011–12 Biathlon World Cup – World Cup 8	0
5840643	Dog Eat Dog (US game show)	0
5840642	Koraku-en	0
9645408	Cowiedesmus eroticopodus	0
5840640	Kynzvart	0
5840639	Astrological house	0
5840638	Underwater Football	0
5840637	Doggie kruger	0
5840636	No Doubt (Petra album)	0
5840635	Chala Bagundi	0
5840634	A Killer Wthin	0
5840633	Paul Chesterton	0
5840664	Matt de la Pena	0
5840632	Mount Cook (Saint Elias Mountains)	0
9645398	Peter J. Conradi	0
5840667	1990 European Athletics Championships - Women'...	0
9645392	Kumkum..Ek Pyaara Sa Bandhan	0
9645393	Thalia dealbata	0
5840694	Mt. Davidson	0
5840693	Gum trees	0
5840692	Bekhaye	0
5840691	Lonsdale Street	0
5840690	Ecumenical views of Mary	0
5840689	Beatrice M. Tinsley Prize	0
5840688	Joanne C. Benson	0
5840687	Anonymus Belae regis notarius	0
9645394	Chinese Palaces	0
5840685	Western high plateau	0
5840684	Sir Derek Milman, 9th Baronet	0
5840683	Deghtzut	0
5840682	Boulengerochromis microlepis	0
5840681	List of butterflies Niue	0

Traversal Visits¶

Top 50 Articles¶

How many articles are there?¶

How many articles have 0 traversal visits?¶

How many articles have fewer than 100 traversal visits?¶

How many articles have more than 100 traversal visits?¶

What's the distribution of traversal visits?¶

all articles¶

On a Logarithmic Scale¶

Power-law exponent: -0.636¶

Top Regime¶

Bottom Regime¶

combined plot with regimes¶

Fit to a Power Law¶

Descriptive Statistics¶

A sample of articles with 0 traversal visits¶