Popularity by Page Views

based on https://wikitech.wikimedia.org/wiki/Analytics/AQS/Pageview_API

  • data is available starting Sep. 2015
In [1]:
import pandas as pd
from scipy import stats 
import numpy as np
import json
import requests

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

path = "/Users/mark/Dropbox/Math/Complex_Systems/research/wikipedia-network/paper/writeup/graphics/"
In [2]:
def get_page_views(article):
    """
    sums daily page view timeseries for Oct 2015 using api
    article title requires proper captilization
    """
    url_start = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/"
    url_end = "/daily/2015100100/2015103100"
    #replace whitespace with underscores
    article = article.replace (" ", "_")
    url = url_start + article + url_end
    page = requests.get(url)
    views_dict = json.loads(page.content)
    views = 0
    try:
        for daily_data in views_dict['items']:
            views += daily_data['views']
    except KeyError:
        pass
    return views
    
In [3]:
get_page_views("Philosophy")
Out[3]:
240642
In [4]:
#load traversal visits and funnels into dataframe (~ 2min runtime)

results_path = "/Users/mark/Desktop/wiki_v4/"

with open(results_path + "clicks.json") as f:
    visits_dict = json.load(f)
visits_df = pd.DataFrame(visits_dict.items())
visits_df.columns = ['article', 'traversal visits']
visits_df = visits_df.sort_values(by='traversal visits', ascending=False)


with open(results_path + "feed_count.json") as f:
    funnels_dict = json.load(f)
funnels_df = pd.DataFrame(funnels_dict.items())
funnels_df.columns = ['article', 'traversal funnels']
funnels_df = funnels_df.sort_values(by='traversal funnels', ascending=False)

Page Views and Traversal Visits

In [5]:
# runtime ~15min
    # run once (else load from file below)
pd.options.mode.chained_assignment = None  # default='warn'
#compute views for articles with highest traversal visits
visits_top_df = visits_df[:1000]
visits_top_df['page views'] = visits_top_df['article'].map(lambda x: get_page_views(x))

#save dataframe to file
temp_store = "/Users/mark/Desktop/wiki_v4/"
visits_top_df.to_json(temp_store + "top_views_and_visits.json")
In [ ]:
#read from file
temp_store = "/Users/mark/Desktop/wiki_v4/"
visits_top_df = pd.read_json(temp_store + "top_views_and_visits.json")
In [12]:
#describe dataframe, scatter plot visits versus views
visits_top_df.describe()
Out[12]:
page views traversal visits
count 1000.000000 1000.000000
mean 68629.603000 207574.081000
std 112218.905163 987874.415932
min 37.000000 7084.000000
25% 11783.000000 10420.000000
50% 31276.500000 20493.000000
75% 72645.750000 51427.500000
max 1393463.000000 7400884.000000

Top Articles Sorted by page views

In [14]:
visits_top_df.sort_values(by='page views', ascending=False)
Out[14]:
article page views traversal visits
7768454 United States 1393463 645237
9806172 India 1037860 87461
4681721 World War II 858787 13153
4931005 United Kingdom 776311 107720
5801744 Canada 744722 102345
8674350 China 726321 42541
6304743 Australia 617709 51048
8786739 Russia 610659 43487
161350 Germany 564323 97357
8343078 New York City 558418 18201
3831085 Israel 526162 17547
1566462 Japan 501838 65402
5566998 London 462797 15304
3567729 France 447101 83942
8467042 Pakistan 445458 14493
3530766 New Zealand 442155 16583
2793428 Turkey 422409 30091
185703 Philippines 400137 10915
1735401 Joseph Stalin 396668 20566
959048 Greek alphabet 389890 89930
917 South Africa 388899 18859
1532577 Hong Kong 384928 8019
7327787 Netherlands 384169 25228
5022396 England 378269 129050
7849273 Soviet Union 378045 19538
10526177 Switzerland 369581 42604
7638317 English language 368575 107230
6638963 California 358683 8841
6548425 Islam 355459 17129
5235135 United Nations 349268 23792
... ... ... ...
4597721 List of racquet sports 883 32523
7384809 Competitive 879 7328
11068298 Communes of Romania 852 22066
9478182 Biological nomenclature 844 8726
5463312 Titles 840 8245
3033232 Rural Districts of Iran 790 131746
532423 Police force 763 36433
4020648 Subkingdom 747 15848
9467948 Administrative divisions of Iran 713 131752
4318890 Published 663 42607
2708386 Political neologism 661 11873
11175663 County (USA) 647 10210
2154211 Administrative divisions 592 65516
2425312 Two party system 585 10183
9573257 Scholars 582 34472
5366522 United States law 567 7561
3831848 Global war 483 13154
4241057 Performing 446 13621
8799362 Suisse romande 422 7956
1496868 Overseas territories 383 58770
504660 Constituent entity 377 44685
337313 Names of Australian rules football 371 14401
158552 Commune in Romania 351 16460
9213569 Administrative subdivision 325 26662
9610099 Suisse Romande 268 7931
8619067 Country subdivisions 215 62507
5343283 Organic Acts 163 7463
1900802 Object (physics) 130 30777
7580925 Hip Hop Music 105 8552
632584 Census-Designated Place 37 24684

1000 rows × 3 columns

In [6]:
visits_top_df['traversal visits (in thousands)'] = visits_top_df['traversal visits'] / 1000.0

sns.jointplot(x="traversal visits (in thousands)", y="page views", data=visits_top_df, stat_func=None)

in log space

In [36]:
visits_top_df['log(traversal visits)'] = np.log10(visits_top_df['traversal visits'])
visits_top_df['log(page views)'] = np.log10(visits_top_df['page views'])

sns.set(font_scale=1.25)
g = sns.jointplot(visits_top_df["log(traversal visits)"], visits_top_df["log(page views)"], stat_func=None)

g.set_axis_labels("$\log_{10}$(traversal visits)", "$\log_{10}$(page views)", fontsize=14)


#save figure
plt.savefig(path+'views_visits.png', format='png', dpi=300, bbox_inches='tight')

with annotations

In [ ]:
sns.set(font_scale=1.25)
g = sns.jointplot(visits_top_df["log(traversal visits)"], visits_top_df["log(page views)"], stat_func=None)
g.annotate("United State", (5.809719, 6.144095)

g.set_axis_labels("$\log_{10}$(traversal visits)", "$\log_{10}$(page views)", fontsize=14)
In [16]:
g = sns.jointplot(x="log(traversal visits)", y="log(page views)", data=visits_top_df, kind="kde", color="g")
g.plot_joint(plt.scatter, c="w", s=30, linewidth=1, marker="+")
Out[16]:
<seaborn.axisgrid.JointGrid at 0x19d2d18d0>
In [19]:
sns.jointplot(x="log(traversal visits)", y="log(page views)", data=visits_top_df, kind="kde")
Out[19]:
<seaborn.axisgrid.JointGrid at 0x19f5dc4d0>
In [30]:
#specific points in log-space
visits_top_df.sort_values(by='log(page views)', ascending=False)
Out[30]:
article page views traversal visits log(traversal visits) log(page views)
7768454 United States 1393463 645237 5.809719 6.144095
9806172 India 1037860 87461 4.941814 6.016139
4681721 World War II 858787 13153 4.119025 5.933885
4931005 United Kingdom 776311 107720 5.032296 5.890036
5801744 Canada 744722 102345 5.010067 5.871994
8674350 China 726321 42541 4.628808 5.861129
6304743 Australia 617709 51048 4.707979 5.790784
8786739 Russia 610659 43487 4.638359 5.785799
161350 Germany 564323 97357 4.988367 5.751528
8343078 New York City 558418 18201 4.260095 5.746959
3831085 Israel 526162 17547 4.244203 5.721119
1566462 Japan 501838 65402 4.815591 5.700564
5566998 London 462797 15304 4.184805 5.665391
3567729 France 447101 83942 4.923979 5.650406
8467042 Pakistan 445458 14493 4.161158 5.648807
3530766 New Zealand 442155 16583 4.219663 5.645575
2793428 Turkey 422409 30091 4.478437 5.625733
185703 Philippines 400137 10915 4.038024 5.602209
1735401 Joseph Stalin 396668 20566 4.313150 5.598427
959048 Greek alphabet 389890 89930 4.953905 5.590942
917 South Africa 388899 18859 4.275519 5.589837
1532577 Hong Kong 384928 8019 3.904120 5.585380
7327787 Netherlands 384169 25228 4.401883 5.584522
5022396 England 378269 129050 5.110758 5.577801
7849273 Soviet Union 378045 19538 4.290880 5.577543
10526177 Switzerland 369581 42604 4.629450 5.567710
7638317 English language 368575 107230 5.030316 5.566526
6638963 California 358683 8841 3.946501 5.554711
6548425 Islam 355459 17129 4.233732 5.550790
5235135 United Nations 349268 23792 4.376431 5.543159
... ... ... ... ... ...
4597721 List of racquet sports 883 32523 4.512191 2.945961
7384809 Competitive 879 7328 3.864985 2.943989
11068298 Communes of Romania 852 22066 4.343724 2.930440
9478182 Biological nomenclature 844 8726 3.940815 2.926342
5463312 Titles 840 8245 3.916191 2.924279
3033232 Rural Districts of Iran 790 131746 5.119737 2.897627
532423 Police force 763 36433 4.561495 2.882525
4020648 Subkingdom 747 15848 4.199974 2.873321
9467948 Administrative divisions of Iran 713 131752 5.119757 2.853090
4318890 Published 663 42607 4.629481 2.821514
2708386 Political neologism 661 11873 4.074560 2.820201
11175663 County (USA) 647 10210 4.009026 2.810904
2154211 Administrative divisions 592 65516 4.816347 2.772322
2425312 Two party system 585 10183 4.007876 2.767156
9573257 Scholars 582 34472 4.537466 2.764923
5366522 United States law 567 7561 3.878579 2.753583
3831848 Global war 483 13154 4.119058 2.683947
4241057 Performing 446 13621 4.134209 2.649335
8799362 Suisse romande 422 7956 3.900695 2.625312
1496868 Overseas territories 383 58770 4.769156 2.583199
504660 Constituent entity 377 44685 4.650162 2.576341
337313 Names of Australian rules football 371 14401 4.158393 2.569374
158552 Commune in Romania 351 16460 4.216430 2.545307
9213569 Administrative subdivision 325 26662 4.425893 2.511883
9610099 Suisse Romande 268 7931 3.899328 2.428135
8619067 Country subdivisions 215 62507 4.795929 2.332438
5343283 Organic Acts 163 7463 3.872913 2.212188
1900802 Object (physics) 130 30777 4.488226 2.113943
7580925 Hip Hop Music 105 8552 3.932068 2.021189
632584 Census-Designated Place 37 24684 4.392416 1.568202

1000 rows × 5 columns

Page Views and Traversal Funnels

In [20]:
funnels_top_df
Out[20]:
article traversal funnels
7948850 Philosophy 7374892
224026 Presentation 30799
9030902 Tree of life (biology) 29274
1344349 Southeast Europe 25745
11029885 Feudalism 19276
632584 Census-Designated Place 17483
7652704 United States Constitution 13952
7974918 Reality 13416
8629119 Health care 10762
7739754 BBC 8945
7580925 Hip Hop Music 7166
4495967 Consciousness 6587
5516532 Balkans 6547
3381363 Quality (philosophy) 5712
5866358 Biological system 5568
1749140 Secondary school 4624
4561872 Reservoir 4571
8281815 Armenia 3943
10404923 Dwelling 3767
5437039 Cancer 3219
5264253 Kingdom of France 3177
5253481 Web page 3113
3198232 Jurisdiction 3111
3442364 Affection 3075
8622161 Photography 2955
6420014 Secondary education 2939
7102628 Music magazine 2588
203945 Decimal 2573
8151674 Provinces of Armenia 2475
5744095 Dam 2472
... ... ...
10628005 Guided Reading 20
5801994 Payphone 20
3425866 Chronic Fatigue Syndrome 20
9525150 Testator 20
10842418 Computational Biology 20
4958528 György Lukács 20
1438974 Five for Fighting 20
3689052 Dj Khaled 20
9658910 Physic Garden 20
149116 Demolition Derby 20
599528 Acaricide 20
2199269 Gormenghast (series) 20
902229 Tao Qian (Han Dynasty) 20
1757144 Deutscher Olympischer Sportbund 20
304292 In Bed with Medinner 20
8212079 New Zealand Knights FC 20
10518076 Master of Ceremonies 20
3096656 Retinol 20
5183891 Óengus of Tallaght 20
10023534 Cromartie High School 20
2078583 Gourdou-Leseurre GL.2 20
10956595 Shamu 20
4602216 Nutley, New Jersey 20
9988769 Mountain rescue 20
9660859 Left Socialists 20
4112841 United States v. ElcomSoft and Sklyarov 19
6362224 Tigrinya language 19
1185440 Agatha of Sicily 19
6720790 Teleomorph, anamorph and holomorph 19
1962985 Ultra Naté 19

1000 rows × 2 columns

In [61]:
#runtime: ~15min

#alternate way to fetch views for top funnels (without sending too many server requests)
from time import sleep

#compute views for articles with highest traversal visits
funnels_top_df = funnels_df[:1000]

views_list = []

for i, a in enumerate(funnels_top_df['article']):
    try:
        views_list.append(get_page_views(a))
    except:
        try:
            sleep(1)
            views_list.append(get_page_views(a))
        except Exception as e:
            print i, a
            print e
            break

funnels_top_df['page views'] = views_list

#save to file
temp_store = "/Users/mark/Desktop/wiki_v4/"
funnels_top_df.to_json(temp_store + "top_views_and_funnels.json")
In [63]:
#read funnels and views from file
temp_store = "/Users/mark/Desktop/wiki_v4/"
funnels_top_df = pd.read_json(temp_store + "top_views_and_funnels.json")
In [64]:
funnels_top_df.describe()
Out[64]:
page views traversal funnels
count 1000.000000 1000.000000
mean 22238.469000 7761.809000
std 95473.283722 233210.959832
min 0.000000 19.000000
25% 1698.500000 28.000000
50% 6504.000000 48.000000
75% 20477.250000 120.000000
max 2795502.000000 7374892.000000
In [65]:
funnels_top_df.sort_values(by='page views', ascending=False)
Out[65]:
article page views traversal funnels
6742736 Halloween 2795502 186
527322 Scientology 463257 232
5759147 Clint Eastwood 341534 72
1496615 Cold War 298464 1692
8743226 Sia Furler 270155 28
8281815 Armenia 259607 3943
7948850 Philosophy 240642 7374892
7652704 United States Constitution 233635 13952
6759575 Narcissism 202935 56
5437039 Cancer 194337 3219
10256548 Backstreet Boys 190396 170
9452835 Namibia 181619 1562
9687129 Robert Redford 167117 41
9475418 Subhas Chandra Bose 166269 73
886977 Bee Gees 163602 332
2304956 Tsunami 159845 104
8487023 Feminism 157042 1429
10489688 List of countries by GDP (PPP) 153524 21
4875689 Howard Stern 138773 34
7082760 Anno Domini 136162 909
7306046 12-hour clock 135056 63
40574 Mandy Moore 128665 44
9872939 Zinc 124736 267
6327244 24-hour clock 122795 24
1647549 Angola 121071 2238
4135530 The Guardian 118335 410
7739754 BBC 117953 8945
7987624 KGB 116854 228
4904621 Avatar: The Last Airbender 116110 117
5964482 Marriage 116067 1714
... ... ... ...
5956724 List of Statutes of New Zealand 60 42
5260882 Fossil Fuel 59 1521
2033516 Iranian Presidential Election, 2009 58 98
10036224 Character Actor 55 380
10628005 Guided Reading 51 20
8173830 Solar Hijri Calendar 46 50
1064851 Perpetual Motion 45 56
200729 Medical Evacuation 44 102
4988691 Heads of Government of Swaziland 42 21
632584 Census-Designated Place 37 17483
6169986 Hybrid Electric Vehicle 33 241
9658910 Physic Garden 33 20
10992217 Retirement Community 31 35
4068617 Ceirano (disambiguation) 31 30
114831 Dance-Rock 29 26
10497963 Scat Singing 29 102
5852041 Institute of Astronomy (disambiguation) 27 20
2457128 Wood Engraving 25 42
11263522 Aragatsotn Region 24 512
9404988 Francis Of Assisi 23 23
8350645 Show Business 19 184
7955822 Book Of Esther 19 56
5577458 William Iii Of England 18 31
2963799 Queueing Theory 16 171
3175857 Caste System In India 16 181
2623881 Learning Disability 12 20
9397861 Palmoplantar Keratoderma 11 22
9410770 Sweat Gland 11 96
8937352 Crash Bar 3 22
191120 List of minor planets/25001–25100 0 73

1000 rows × 3 columns

In [66]:
sns.jointplot(x="traversal funnels", y="page views", data=funnels_top_df, stat_func=None)

#save figure
plt.savefig(path+'funnels_visits.png', format='png', dpi=300, bbox_inches='tight')

Excluding Philosophy as an outlier:

In [67]:
sns.jointplot(x="traversal funnels", y="page views", data=funnels_top_df[funnels_top_df['article'] != "Philosophy"], stat_func=None)

plt.text(-3000, 500000, "* excludes Philosophy \n (with 7.37 million funnels)")
Out[67]:
<matplotlib.text.Text at 0x1909cdd50>

in log space

In [68]:
funnels_top_df['log(traversal funnels)'] = np.log10(funnels_top_df['traversal funnels']+1)
funnels_top_df['log(page views)'] = np.log10(funnels_top_df['page views']+1)
In [81]:
sns.set(font_scale=1.25)
g = sns.jointplot(funnels_top_df["log(traversal funnels)"], funnels_top_df["log(page views)"], stat_func=None)
g.set_axis_labels("$\log_{10}$(traversal funnels)", "$\log_{10}$(page views)", fontsize=14)

#save figure
plt.savefig(path+ 'views_funnels.png', format='png', dpi=300, bbox_inches='tight')
In [77]:
funnels_top_df.sort_values(by="log(page views)", ascending=False).head(10)
Out[77]:
article page views traversal funnels log(traversal funnels) log(page views)
6742736 Halloween 2795502 186 2.271842 6.446460
527322 Scientology 463257 232 2.367356 5.665823
5759147 Clint Eastwood 341534 72 1.863323 5.533435
1496615 Cold War 298464 1692 3.228657 5.474893
8743226 Sia Furler 270155 28 1.462398 5.431615
8281815 Armenia 259607 3943 3.595937 5.414318
7948850 Philosophy 240642 7374892 6.867756 5.381373
7652704 United States Constitution 233635 13952 4.144668 5.368540
6759575 Narcissism 202935 56 1.755875 5.307359
5437039 Cancer 194337 3219 3.507856 5.288558
In [76]:
funnels_top_df.sort_values(by="log(traversal funnels)", ascending=False).head(10)
Out[76]:
article page views traversal funnels log(traversal funnels) log(page views)
7948850 Philosophy 240642 7374892 6.867756 5.381373
224026 Presentation 15787 30799 4.488551 4.198327
9030902 Tree of life (biology) 9571 29274 4.466497 3.981003
1344349 Southeast Europe 12216 25745 4.410710 4.086965
11029885 Feudalism 88076 19276 4.285039 4.944863
632584 Census-Designated Place 37 17483 4.242641 1.579784
7652704 United States Constitution 233635 13952 4.144668 5.368540
7974918 Reality 42940 13416 4.127655 4.632872
8629119 Health care 53926 10762 4.031933 4.731806
7739754 BBC 117953 8945 3.951629 5.071713
In [83]:
funnels_top_df.sort_values(by="log(page views)", ascending=True).head(10)
Out[83]:
article page views traversal funnels log(traversal funnels) log(page views)
191120 List of minor planets/25001–25100 0 73 1.869232 0.000000
8937352 Crash Bar 3 22 1.361728 0.602060
9410770 Sweat Gland 11 96 1.986772 1.079181
9397861 Palmoplantar Keratoderma 11 22 1.361728 1.079181
2623881 Learning Disability 12 20 1.322219 1.113943
2963799 Queueing Theory 16 171 2.235528 1.230449
3175857 Caste System In India 16 181 2.260071 1.230449
5577458 William Iii Of England 18 31 1.505150 1.278754
7955822 Book Of Esther 19 56 1.755875 1.301030
8350645 Show Business 19 184 2.267172 1.301030
In [ ]: