CoCalc -- github_datapull.py

⁵³⁸⁰ views
1

2
# Pull (or load from disk) the number of GitHub repositories having at least
# a given number of stars / forks.
#
# After this script runs, four module-level arrays are defined:
#   stars_to_explore, repo_with_stars  -- star thresholds and matching repo counts
#   forks_to_explore, repo_with_forks  -- fork thresholds and matching repo counts
#
# numpy is needed by both the scraping path and the CSV fallback, so it is
# imported unconditionally; only requests / BeautifulSoup are optional.
import numpy as np

URL = "https://github.com/search"


def _count_repos(fetch, parse, query):
    """Return the repository count GitHub search reports for *query*.

    Args:
        fetch: callable used as ``fetch(URL, params=...)``; its result must
            expose the page body as ``.text`` (``requests.get`` in this script).
        parse: callable turning the page text into a soup object
            (``BeautifulSoup`` in this script).
        query: value sent as the ``q`` search parameter, e.g. ``"stars:>=4"``.

    Returns:
        The integer taken from the third space-separated token of the
        ``sort-bar`` header, or, when that header is missing, the number of
        ``mega-icon-public-repo`` elements found on the page.
    """
    soup = parse(fetch(URL, params={"q": query}).text)
    try:
        # hopefully the github search results page plays nicely.
        # NOTE(review): presumably the header reads like "We've found N ..." --
        # confirm against the current page markup.
        h3 = soup.find(class_="sort-bar").find("h3").text
        return int(h3.split(" ")[2].replace(",", ""))
    except AttributeError:
        # there might be less than 10 repos, so count the displayed results
        return len(soup.findAll(class_="mega-icon-public-repo"))


try:
    # Keep the try body limited to the optional imports so that only a
    # genuinely missing package triggers the CSV fallback.
    from requests import get
    from bs4 import BeautifulSoup

except ImportError as e:
    print(e)
    print("requests / BeautifulSoup not found. Using data pulled on Feburary 11, 2013")
    _data = np.genfromtxt("data/gh_forks.csv", delimiter=",")  # check this.
    forks_to_explore = _data[:, 0]
    repo_with_forks = _data[:, 1]

    _data = np.genfromtxt("data/gh_stars.csv", delimiter=",")  # check this.
    stars_to_explore = _data[:, 0]
    repo_with_stars = _data[:, 1]

else:
    # Thresholds 0, 1, 2, 4, ..., 32768. A float base is required: NumPy
    # refuses integer ** negative-integer, and 2.0**-1 == 0.5 truncates to 0.
    stars_to_explore = (2.0 ** np.arange(-1, 16)).astype("int")
    forks_to_explore = (2.0 ** np.arange(-1, 16)).astype("int")
    repo_with_stars = np.ones_like(stars_to_explore)
    repo_with_forks = np.ones_like(forks_to_explore)

    print("Scrapping data from Github. Sorry Github...")
    print("The data is contained in variables `foo_to_explore` and `repo_with_foo`")
    print("")
    print("stars first...")
    for i, _star in enumerate(stars_to_explore):
        value = _count_repos(get, BeautifulSoup, "stars:>=%d" % _star)
        repo_with_stars[i] = value
        print("number of repos with greater than or equal to %d stars: %d" % (_star, value))

    print("")
    print("forks second...")
    for i, _fork in enumerate(forks_to_explore):
        value = _count_repos(get, BeautifulSoup, "forks:>=%d" % _fork)
        repo_with_forks[i] = value
        print("number of repos with greater than or equal to %d forks: %d" % (_fork, value))

    # Write two comma-separated columns (threshold, count) so the files match
    # what the fallback branch reads with genfromtxt(..., delimiter=",").
    np.savetxt(
        "data/gh_forks.csv",
        np.column_stack([forks_to_explore, repo_with_forks]),
        fmt="%d",
        delimiter=",",
    )
    np.savetxt(
        "data/gh_stars.csv",
        np.column_stack([stars_to_explore, repo_with_stars]),
        fmt="%d",
        delimiter=",",
    )
70
    
71
    
72
    
73
Product

Resources

Company