📚 The CoCalc Library - books, templates and other resources
License: OTHER
# GitHub data scraper

"""
Sample random public GitHub repositories and save their features as CSV.

Each of the N rows of X describes one non-fork repository whose primary
language is a key of ``language_mappings``.

variables of interest:
indep. variables
- language, one-hot encoded: 6 positions for 6 languages (columns 0-5)
- number of days since the repository was created, 1 position (column 6)
- has wiki? Boolean, 1 position (column 7)
- following, 1 position (column 8)
- followers, 1 position (column 9)
- constant (no column is written for it in X)

dep. variables
- stars/watchers (column 10)
- forks (column 11)

"""
from json import loads
import datetime
import os

import numpy as np
from requests import get


MAX = 8000000  # upper bound (exclusive) for the random repository id offset
today = datetime.datetime.today()
randint = np.random.randint
N = 120  # sample size.
auth = ("username", "password")  # placeholder credentials for HTTP basic auth
OUTPUT_PATH = "data/github_data.csv"

language_mappings = {"Python": 0, "JavaScript": 1, "Ruby": 2, "Java": 3, "Shell": 4, "PHP": 5}

# define data matrix:
X = np.zeros((N, 12), dtype=int)


def _fetch_json(url, params=None):
    """Return the decoded JSON body of an authenticated GET request to *url*.

    Raises requests.HTTPError on a 4xx/5xx response (bad credentials, rate
    limiting, ...) so that an API error payload is reported as such instead
    of failing later with an unrelated KeyError/IndexError.
    """
    response = get(url, params=params, auth=auth)
    response.raise_for_status()
    return loads(response.text)


def _sample_repository():
    """Draw random repositories until one is usable and return it.

    Returns a ``(listing_entry, repo_details)`` pair for a repository that is
    not a fork and whose language is a key of ``language_mappings``.
    """
    while True:
        listing = _fetch_json(
            "https://api.github.com/repositories",
            params={"since": randint(0, MAX)},
        )
        if not listing:
            # Empty page: nothing listed after this offset, draw another one.
            continue

        # Only the first repository of the page is of interest.
        entry = listing[0]
        if entry["fork"]:
            # Reject forks before spending a second request on their details.
            continue

        details = _fetch_json(entry["url"])
        # "language" can be null for repositories without detected code.
        if details.get("language") in language_mappings:
            return entry, details


# Create the output directory up front so that a missing folder cannot
# discard the whole sample at the very end of the run.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir and not os.path.isdir(output_dir):
    os.makedirs(output_dir)

for i in range(N):
    results, repo_results = _sample_repository()

    # languages
    X[i, language_mappings[repo_results["language"]]] = 1

    # delta time: only the date part (YYYY-MM-DD) of the timestamp is parsed
    created = datetime.datetime.strptime(repo_results["created_at"][:10], "%Y-%m-%d")
    X[i, 6] = (today - created).days

    # has wiki
    X[i, 7] = repo_results["has_wiki"]

    # get user information
    user_results = _fetch_json(results["owner"]["url"])
    X[i, 8] = user_results["following"]
    X[i, 9] = user_results["followers"]

    # get dep. data
    X[i, 10] = repo_results["watchers_count"]
    X[i, 11] = repo_results["forks_count"]

    # Single-string print calls behave the same on Python 2 and Python 3.
    print(" -------------- ")
    print("{0} :  {1} {2} {3} {4}".format(
        i,
        results["full_name"],
        repo_results["language"],
        repo_results["watchers_count"],
        repo_results["forks_count"],
    ))
    print(" -------------- ")


np.savetxt(OUTPUT_PATH, X, delimiter=",", fmt="%d")