CoCalc -- churn_measurements.py

📚 The CoCalc Library - books, templates and other resources
cocalc-examples / data-science-ipython-notebooks / analyses / churn_measurements.py
¹³²⁹²³ views
License: OTHER
1
from __future__ import division
2
import numpy as np
3

4
__author__ = "Eric Chiang"
5
__email__  = "eric[at]yhathq.com"
6

7
"""
8

9
Measurements inspired by Philip Tetlock's "Expert Political Judgment"
10

11
Equations taken from Yaniv, Yates, & Smith (1991):
12
  "Measures of Discrimination Skill in Probabilistic Judgment"
13

14
"""
15

16

17
def calibration(prob, outcome, n_bins=10):
    """Calibration measurement for a set of predictions.

    When predicting events at a given probability, how far is the observed
    frequency of positive outcomes from that probability?
    NOTE: Lower scores are better (0.0 means every bin's mean prediction
    equals the observed frequency in that bin).

    prob: array_like, float
        Probability estimates for a set of events.

    outcome: array_like, bool
        Whether each predicted event occurred. Must contain as many
        elements as `prob`.

    n_bins: int
        Number of judgement categories to perform the calculation over.
        Predictions are binned based on probability, since "discrete"
        probabilities aren't required. Bins are equal-width over [0, 1];
        the top bin also holds predictions of exactly 1.0 (and anything
        above), so at most `n_bins` categories cover valid probabilities.

    Returns the observation-weighted mean, over the occupied bins, of the
    squared gap between the mean predicted probability and the observed
    outcome frequency.

    Raises ValueError if `n_bins` is smaller than 1, if the inputs are
    empty, or if `prob` and `outcome` differ in number of elements.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be at least 1, got %r" % (n_bins,))

    # Flatten so that column vectors and plain sequences behave alike.
    prob = np.asarray(prob, dtype=float).ravel()
    outcome = np.asarray(outcome, dtype=float).ravel()
    if prob.size != outcome.size:
        raise ValueError(
            "prob and outcome must have the same number of elements "
            "(got %d and %d)" % (prob.size, outcome.size)
        )
    if prob.size == 0:
        raise ValueError("calibration is undefined for empty input")

    # Bin edges 0, 1/n, ..., 1. The explicit float() keeps this a true
    # division even without `from __future__ import division`.
    judgement_bins = np.arange(n_bins + 1) / float(n_bins)

    # np.digitize uses half-open intervals [lo, hi), which would send a
    # prediction of exactly 1.0 to an extra bin numbered n_bins + 1.
    # Capping the index folds it into the top bin instead.
    bin_num = np.minimum(np.digitize(prob, judgement_bins), n_bins)

    c = 0.0
    for j_bin in np.unique(bin_num):
        in_bin = bin_num == j_bin
        # Predicted probability taken as the average of predictions in bin
        predicted_prob = np.mean(prob[in_bin])
        # How often did events in this bin actually happen?
        true_bin_prob = np.mean(outcome[in_bin])
        # Squared distance between predicted and true, weighted by bin size
        c += np.sum(in_bin) * ((predicted_prob - true_bin_prob) ** 2)
    return c / prob.size
54

55
def discrimination(prob, outcome, n_bins=10):
    """Discrimination measurement for a set of predictions.

    For each judgement category, how far from the base probability
    is the true frequency of that bin?
    NOTE: High scores are better

    prob: array_like, float
        Probability estimates for a set of events.

    outcome: array_like, bool
        Whether each predicted event occurred. Must contain as many
        elements as `prob`.

    n_bins: int
        Number of judgement categories to perform the calculation over.
        Predictions are binned based on probability, since "discrete"
        probabilities aren't required. Bins are equal-width over [0, 1];
        the top bin also holds predictions of exactly 1.0 (and anything
        above), so at most `n_bins` categories cover valid probabilities.

    Returns the observation-weighted mean, over the occupied bins, of the
    squared gap between the bin's observed outcome frequency and the
    overall outcome frequency.

    Raises ValueError if `n_bins` is smaller than 1, if the inputs are
    empty, or if `prob` and `outcome` differ in number of elements.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be at least 1, got %r" % (n_bins,))

    # Flatten so that column vectors and plain sequences behave alike.
    prob = np.asarray(prob, dtype=float).ravel()
    outcome = np.asarray(outcome, dtype=float).ravel()
    if prob.size != outcome.size:
        raise ValueError(
            "prob and outcome must have the same number of elements "
            "(got %d and %d)" % (prob.size, outcome.size)
        )
    if prob.size == 0:
        raise ValueError("discrimination is undefined for empty input")

    # Base frequency of outcomes across all observations
    base_prob = np.mean(outcome)

    # Bin edges 0, 1/n, ..., 1. The explicit float() keeps this a true
    # division even without `from __future__ import division`.
    judgement_bins = np.arange(n_bins + 1) / float(n_bins)

    # np.digitize uses half-open intervals [lo, hi), which would send a
    # prediction of exactly 1.0 to an extra bin numbered n_bins + 1.
    # Capping the index folds it into the top bin instead.
    bin_num = np.minimum(np.digitize(prob, judgement_bins), n_bins)

    d = 0.0
    for j_bin in np.unique(bin_num):
        in_bin = bin_num == j_bin
        # How often did events in this bin actually happen?
        true_bin_prob = np.mean(outcome[in_bin])
        # Squared distance between true and base, weighted by bin size
        d += np.sum(in_bin) * ((true_bin_prob - base_prob) ** 2)
    return d / prob.size
90

91
Product

Resources

Company