📚 The CoCalc Library - books, templates and other resources
License: OTHER
"""This module contains a code example related to

Think Python, 2nd Edition
by Allen Downey
http://thinkpython2.com

Copyright 2015 Allen Downey

License: http://creativecommons.org/licenses/by/4.0/
"""

from __future__ import print_function, division

import random
import string


def process_file(filename, skip_header):
    """Makes a histogram that contains the words from a file.

    Reading stops at the first line that starts with '*** END OF THIS'.

    filename: string
    skip_header: boolean, whether to skip the Gutenberg header

    returns: map from each word to the number of times it appears.
    """
    hist = {}

    # The context manager closes the file even if processing raises.
    with open(filename) as fp:
        if skip_header:
            skip_gutenberg_header(fp)

        for line in fp:
            if line.startswith('*** END OF THIS'):
                break

            process_line(line, hist)

    return hist


def skip_gutenberg_header(fp):
    """Reads from fp until it finds the line that ends the header.

    The marker line itself is consumed.  If no line starts with
    '*** START OF THIS', fp is read through to the end.

    fp: open file object
    """
    for line in fp:
        if line.startswith('*** START OF THIS'):
            break


def process_line(line, hist):
    """Adds the words in the line to the histogram.

    Modifies hist.

    Hyphens are treated as word separators.  Leading and trailing
    punctuation and whitespace are removed from each word, and the word
    is converted to lowercase.  Tokens that consist only of punctuation
    are ignored.

    line: string
    hist: histogram (map from word to frequency)
    """
    # TODO: rewrite using Counter

    # replace hyphens with spaces before splitting
    line = line.replace('-', ' ')
    strippables = string.punctuation + string.whitespace

    for word in line.split():
        # remove punctuation and convert to lowercase
        word = word.strip(strippables).lower()

        # a token like "'" or "***" strips down to nothing; it is not a
        # word, so do not count it under the empty-string key
        if not word:
            continue

        # update the histogram
        hist[word] = hist.get(word, 0) + 1


def most_common(hist):
    """Makes a list of word-freq pairs in descending order of frequency.

    Words with the same frequency appear in reverse alphabetical order,
    because the (frequency, word) tuples are compared as a whole.

    hist: map from word to frequency

    returns: list of (frequency, word) pairs
    """
    return sorted(((freq, word) for word, freq in hist.items()),
                  reverse=True)


def print_most_common(hist, num=10):
    """Prints the most common words in a histogram and their frequencies.

    hist: histogram (map from word to frequency)
    num: number of words to print
    """
    t = most_common(hist)
    print('The most common words are:')
    for freq, word in t[:num]:
        print(word, '\t', freq)


def subtract(d1, d2):
    """Returns a dictionary with all keys that appear in d1 but not d2.

    Every value in the result is None; only the keys are meaningful.

    d1, d2: dictionaries
    """
    # TODO: reimplement using Counter
    return {key: None for key in d1 if key not in d2}


def total_words(hist):
    """Returns the total of the frequencies in a histogram."""
    return sum(hist.values())


def different_words(hist):
    """Returns the number of different words in a histogram."""
    return len(hist)


def random_word(hist):
    """Chooses a random word from a histogram.

    The probability of each word is proportional to its frequency.

    Raises IndexError (from random.choice) if the histogram is empty.
    """
    # TODO: rewrite using Counter

    # build a list in which each word appears as many times as its
    # frequency, so a uniform choice from it is frequency-weighted
    t = []
    for word, freq in hist.items():
        t.extend([word] * freq)

    return random.choice(t)


def main():
    """Prints word statistics for the book stored in '158-0.txt'.

    Also reads 'words.txt' and prints the words in the book that are
    not in that word list, followed by 100 random words from the book.
    """
    hist = process_file('158-0.txt', skip_header=True)
    print('Total number of words:', total_words(hist))
    print('Number of different words:', different_words(hist))

    print_most_common(hist, 20)

    words = process_file('words.txt', skip_header=False)

    diff = subtract(hist, words)
    print("The words in the book that aren't in the word list are:")
    for word in diff:
        print(word, end=' ')

    print("\n\nHere are some random words from the book")
    for _ in range(100):
        print(random_word(hist), end=' ')


if __name__ == '__main__':
    main()