Article rank by traverals visits vs. number of synonyms

  • synonyms are from WordNet
In [55]:
from nltk.corpus import wordnet as wn
import pandas as pd
import json
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

path = "/Users/mark/Dropbox/Math/Complex_Systems/research/wikipedia-network/writeup/graphics/"
In [55]:
#load highest ranking articles
results_path = "/Users/mark/Desktop/wiki_v4/"
with open(results_path + "clicks.json") as f:
    dict = json.load(f)
df = pd.DataFrame(dict.items())
df.columns = ['page', 'visits']
In [3]:
def get_num_syn(word):
    syn_num = 0
    syn_sets = wn.synsets(word)
    for syn_set in syn_sets:
        syn_num += len(syn_set.lemma_names())
    return syn_num
        
In [4]:
#add num syn dataframe colum for all articles 

top_df = df.sort(columns='visits', ascending=False)
top_df['number synonyms']= top_df['page'].map(get_num_syn)
top_df['rank'] = range(top_df.shape[0])
In [5]:
ex_zero_df = top_df[top_df['number synonyms'] > 0]
sns.boxplot(x="number synonyms", data=ex_zero_df)
Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x19fa0b510>
In [67]:
len(ex_zero_df)
Out[67]:
93868
In [74]:
ex_zero_df['groups'] = ['top 100 articles']*100 + ['all other articles']*(len(ex_zero_df) - 100)
<matplotlib.figure.Figure at 0x2465e8f90>
In [85]:
sns.boxplot(x='number synonyms', y='groups', data=ex_zero_df)
plt.text(150, 0.5, "*excludes articles \n with no synonyms")

#save figure
plt.savefig(path+'synonyms.png', format='png', dpi=300, bbox_inches='tight')
In [77]:
In [52]:
 
In [16]:
sns.lmplot('rank', 'number synonyms', data=top_df, fit_reg=False)
Out[16]:
<seaborn.axisgrid.FacetGrid at 0x14b3c9290>
In [35]:
#average syn of top 100 pages
#vs. average syn of sample
top_df[:100]['number synonyms'].sum()
Out[35]:
501
In [7]:
np.mean(top_df[:100]['number synonyms'])
Out[7]:
5.0099999999999998
In [8]:
#rest of dataset
np.mean(top_df[100:]['number synonyms'])
Out[8]:
0.04851812921272694
In [11]:
#highest ranking median
np.median(top_df[:100]['number synonyms'])
Out[11]:
2.5
In [12]:
#rest median
np.mean(top_df[100:]['number synonyms'])
Out[12]:
0.04851812921272694
In [38]:
#top 100 mean exclude zeros
np.mean(top_df[:100][top_df[:100]['number synonyms'] > 0])
Out[38]:
visits             2331610.400000
number synonyms          9.109091
rank                    44.145455
dtype: float64
In [39]:
#mean of rest, exclude zeros
np.mean(top_df[100:][top_df[100:]['number synonyms'] > 0])
Out[39]:
visits                 181.215525
number synonyms          5.832454
rank               4018477.125590
dtype: float64
In [41]:
np.median(top_df[100:][top_df[100:]['number synonyms'] > 0]['number synonyms'])
Out[41]:
3.0
In [42]:
np.median(top_df[:100][top_df[:100]['number synonyms'] > 0]['number synonyms'])
Out[42]:
7.0