$n$-gram Models¶

An $n$-gram model is the simplest kind of language model: a probabilistic model of language, meaning it assigns probabilities to sequences of words.

Think about the following sentence fragment:

The cat sat on the _____

What word should fill in the blank?

If you said

mat

then we agree! But why do we agree?

Well, you might argue that our internal language models have both assigned mat a high probability, given the preceding context.

Because we considered 5 words to predict the 6th word, we may have accessed something in our minds that behaves like a 6-gram language model - that is, an $n$-gram model with $n=6$.

Mathematical definition of probability¶

An $n$-gram language model defines a probability distribution over all known words, given some preceding context.

Let's define all of these terms mathematically, and very carefully!

Probability distributions¶

A probability distribution $p(x)$ is a function that takes an event, $x$, and outputs a probability $p(x)$. For instance, here is a probability distribution describing the process of flipping a fair coin:

$$ p(x=\text{HEADS}) = 1/2 $$ $$ p(x=\text{TAILS}) = 1/2 $$

Easy enough, right? We expect half of all coin flips to be heads, and the other half to be tails. Notice that the probabilities are between 0 and 1, and that they sum to 1:

In [ ]:
def p_coin_flip(x):
    if x == "HEADS":
        return 1/2
    elif x == "TAILS":
        return 1/2
    else:
        return 0

print('p(HEADS):', p_coin_flip("HEADS"))
print('p(TAILS):', p_coin_flip("TAILS"))

s = 0
for outcome in ["HEADS", "TAILS"]:
    s += p_coin_flip(outcome)
print('     sum:', s)
p(HEADS): 0.5
p(TAILS): 0.5
     sum: 1.0

Another simple example is rolling a fair die (like in a board game):

$$ p(x=1) = 1/6 $$ $$ p(x=2) = 1/6 $$ $$ p(x=3) = 1/6 $$ $$ p(x=4) = 1/6 $$ $$ p(x=5) = 1/6 $$ $$ p(x=6) = 1/6 $$

There are six faces on the die, and we have defined $x$ to mean "how many dots are on the side facing up after rolling it." All the sides are equally likely to end up face-up, so all six events have the same probability. Notice again that the probabilities are between 0 and 1, and that they sum to 1.

In [ ]:
def p_die6(x):
    if x == 1:
        return 1/6
    elif x == 2:
        return 1/6
    elif x == 3:
        return 1/6
    elif x == 4:
        return 1/6
    elif x == 5:
        return 1/6
    elif x == 6:
        return 1/6
    else:
        return 0

print('p(1):', p_die6(1))
print('p(2):', p_die6(2))
print('p(3):', p_die6(3))
print('p(4):', p_die6(4))
print('p(5):', p_die6(5))
print('p(6):', p_die6(6))

s = 0
for outcome in range(1, 7): # 1, 2, 3, 4, 5, 6
    s += p_die6(outcome)
print(' sum:', s)
p(1): 0.16666666666666666
p(2): 0.16666666666666666
p(3): 0.16666666666666666
p(4): 0.16666666666666666
p(5): 0.16666666666666666
p(6): 0.16666666666666666
 sum: 0.9999999999999999

Properties that probability distributions must satisfy¶

Let's call the set of all possible events $\mathcal{X}$ (sometimes called the support). For a coin flip,

$$ \mathcal{X} = \{\text{HEADS}, \text{TAILS}\} $$

For a die roll,

$$ \mathcal{X} = \{1, 2, 3, 4, 5, 6\} $$

Then, we need $p(x)$ to satisfy two properties. First, probabilities are restricted in their possible values:

$$ \forall x \in \mathcal{X} : 0 \leq p(x) \leq 1 $$

This means that for every possible outcome $x$, we need to define $x$'s probability to be between 0 and 1. A probability of 0 means it is impossible, while a probability of 1 means it is certain. Of course, nothing can be less likely than impossible, and nothing can be more likely than certain!

Second, probabilities need to be compatible with each other:

$$ \sum_{x\in\mathcal{X}} p(x) = 1 $$

This means that something must happen! If we consider every possible outcome, SOME outcome MUST happen - so the probabilities add up to 1. We are 100% certain that something will happen.

In [ ]:
def is_valid_probability_distribution(p, outcomes):

    """
    Check if a probability distribution `p` with support `outcomes` is valid.
    """

    # sum up total probability - it should be exactly 1
    s = 0

    # consider all outcomes in the support
    for outcome in outcomes:

        # make sure the probability assigned is valid
        if p(outcome) < 0 or p(outcome) > 1:
            return False # if not, the distribution is invalid!

        s += p(outcome) # if it is, keep going - add up the contribution

    # check if the sum is close to 1 - we can have precision mistakes :(
    return abs(s - 1) < 0.00001

print(
    '         coin flip:',
    is_valid_probability_distribution(p_coin_flip, ["HEADS", "TAILS"])
)

print(
    'six-sided die roll:',
    is_valid_probability_distribution(p_die6, [1, 2, 3, 4, 5, 6])
)
         coin flip: True
six-sided die roll: True

Conditional probability¶

Conditional probabilities tell us about how a probability distribution changes depending on what information we know.

Let's start by thinking about coin flips and dice.

Imagine that we carry out the following process:

  • Flip a coin. Assign the outcome of the coin flip to the variable $x$.
  • If heads, roll a 4-sided die.
  • If tails, roll a 6-sided die.
  • Assign the outcome of the die roll to the variable $y$.

Then, $p(x)$ looks familiar:

$$ p(x=\text{H}) = 1/2 $$ $$ p(x=\text{T}) = 1/2 $$

What about $p(y)$? Things are a little more complicated here. It's a bit easier to start by talking about $p(y|x)$, read as "the probability of $y$ given $x$."

If the coin is heads, 5 and 6 become impossible, because we are rolling the 4-sided die:

$$ p(y=1|x=\text{H}) = 1/4 $$ $$ p(y=2|x=\text{H}) = 1/4 $$ $$ p(y=3|x=\text{H}) = 1/4 $$ $$ p(y=4|x=\text{H}) = 1/4 $$ $$ p(y=5|x=\text{H}) = 0 $$ $$ p(y=6|x=\text{H}) = 0 $$

If the coin is tails, we have the 6-sided die:

$$ p(y=1|x=\text{T}) = 1/6 $$ $$ p(y=2|x=\text{T}) = 1/6 $$ $$ p(y=3|x=\text{T}) = 1/6 $$ $$ p(y=4|x=\text{T}) = 1/6 $$ $$ p(y=5|x=\text{T}) = 1/6 $$ $$ p(y=6|x=\text{T}) = 1/6 $$
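If it helps to see this in code, here is a minimal sketch of the conditional distribution above, in the same style as p_coin_flip and p_die6 and using the H/T abbreviations from the math (the name p_die_given_coin is just for illustration):

In [ ]:
def p_die_given_coin(y, x):
    """p(y|x): distribution over die rolls y, given the coin flip outcome x."""
    if x == "H": # 4-sided die: only 1-4 are possible
        return 1/4 if y in (1, 2, 3, 4) else 0
    elif x == "T": # 6-sided die: all of 1-6 are equally likely
        return 1/6 if y in (1, 2, 3, 4, 5, 6) else 0
    else:
        return 0

# for each fixed value of x, the probabilities over y should sum to (roughly) 1
print(sum(p_die_given_coin(y, "H") for y in range(1, 7)))
print(sum(p_die_given_coin(y, "T") for y in range(1, 7)))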

Notice how the context of the coin flip influenced our expectations regarding what numbers we might see? But we still don't quite know what $p(y)$ looks like...

Joint probability¶

Joint probabilities tell us about how likely it is that multiple events occur together. We write this as $p(x,y)$, which is read "probability of $x$ and $y$." In general,

$$ p(x,y) = p(y|x) * p(x) $$

Think about this. In order to get to the point where we roll the 4-sided die, we first need to get heads ($p(x=\text{H})=1/2$) when we flip the coin. Therefore, we will only ever consider the 4-sided die half the time.

Now, within this subset of the possible events that can happen, we can consider our $p(y|x=\text{H})$ from above. The total probability of, for instance, getting heads and then rolling a 3 is the probability of getting heads (1/2) times the probability of now getting a 3 given we just got heads (1/4).

In the above example, we have:

$$ p(x=\text{H},y=1) = p(y=1|x=\text{H}) * p(x=\text{H}) = 1/4 * 1/2 = 1/8 $$ $$ p(x=\text{H},y=2) = p(y=2|x=\text{H}) * p(x=\text{H}) = 1/4 * 1/2 = 1/8 $$ $$ p(x=\text{H},y=3) = p(y=3|x=\text{H}) * p(x=\text{H}) = 1/4 * 1/2 = 1/8 $$ $$ p(x=\text{H},y=4) = p(y=4|x=\text{H}) * p(x=\text{H}) = 1/4 * 1/2 = 1/8 $$ $$ p(x=\text{H},y=5) = p(y=5|x=\text{H}) * p(x=\text{H}) = 0 * 1/2 = 0 $$ $$ p(x=\text{H},y=6) = p(y=6|x=\text{H}) * p(x=\text{H}) = 0 * 1/2 = 0 $$

$$ p(x=\text{T},y=1) = p(y=1|x=\text{T}) * p(x=\text{T}) = 1/6 * 1/2 = 1/12 $$ $$ p(x=\text{T},y=2) = p(y=2|x=\text{T}) * p(x=\text{T}) = 1/6 * 1/2 = 1/12 $$ $$ p(x=\text{T},y=3) = p(y=3|x=\text{T}) * p(x=\text{T}) = 1/6 * 1/2 = 1/12 $$ $$ p(x=\text{T},y=4) = p(y=4|x=\text{T}) * p(x=\text{T}) = 1/6 * 1/2 = 1/12 $$ $$ p(x=\text{T},y=5) = p(y=5|x=\text{T}) * p(x=\text{T}) = 1/6 * 1/2 = 1/12 $$ $$ p(x=\text{T},y=6) = p(y=6|x=\text{T}) * p(x=\text{T}) = 1/6 * 1/2 = 1/12 $$
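As a quick check, we can compute these joint probabilities with the product rule in code. This sketch reuses the p_die_given_coin function sketched above; the names p_coin and p_joint are just for illustration:

In [ ]:
def p_coin(x):
    # p(x): a fair coin, abbreviating HEADS/TAILS to H/T as in the math above
    return 1/2 if x in ("H", "T") else 0

def p_joint(x, y):
    # p(x, y) = p(y|x) * p(x)
    return p_die_given_coin(y, x) * p_coin(x)

print(p_joint("H", 3)) # should be 1/8 = 0.125
print(p_joint("H", 5)) # should be 0
print(p_joint("T", 3)) # should be 1/12, about 0.083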

Overall, we can calculate $p(y)$ by summing up all the different ways we can get each possible value of $y$. There are two ways to get a 1, for instance:

  • We can get heads and then roll a 1 on the 4-sided die (probability $1/2 * 1/4 = 1/8$)
  • We can get tails and then roll a 1 on the 6-sided die (probability $1/2 * 1/6 = 1/12$)

$$ p(y=1) = 1/8 + 1/12 \approx 0.208 $$ $$ p(y=2) = 1/8 + 1/12 \approx 0.208 $$ $$ p(y=3) = 1/8 + 1/12 \approx 0.208 $$ $$ p(y=4) = 1/8 + 1/12 \approx 0.208 $$ $$ p(y=5) = 0 + 1/12 \approx 0.083 $$ $$ p(y=6) = 0 + 1/12 \approx 0.083 $$
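Before simulating, we can verify these numbers exactly by summing the joint probabilities over both coin outcomes (again using the sketch functions above):

In [ ]:
# p(y) = p(H, y) + p(T, y), i.e. sum the joint probability over all values of x
for y in range(1, 7):
    print(y, p_joint("H", y) + p_joint("T", y))
# expect roughly 0.208 for y = 1, 2, 3, 4 and roughly 0.083 for y = 5, 6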

Let's simulate this and see if it comes out that way:

In [ ]:
import random

# dictionary with keys 1...6, each starting at 0 (we will count observations here)
p_y = dict.fromkeys([1, 2, 3, 4, 5, 6], 0)

N = 1000 # number of observations we will make

for i in range(N): # 0, ..., N-1
    coin_flip = random.randint(0, 1) # randomly choose 0 or 1
    if coin_flip == 0:
        die_roll = random.randint(1, 4) # randomly choose 1, 2, 3, or 4
    else:
        die_roll = random.randint(1, 6) # randomly choose 1, ..., 6
    p_y[die_roll] += 1 # count our observation

for key, value in p_y.items():
    print(key, value / N) # divide by N to get observed probability
1 0.202
2 0.196
3 0.232
4 0.199
5 0.085
6 0.086

Learning a unigram model¶

Coin flips and dice are boring. Let's talk about natural language.

Let $p(w|c)$ mean the probability of observing a word $w$ given some preceding context $c$. $c$ can be anything we want - it can be the preceding word, the preceding 2 words, 3 words, ... or even nothing at all!

Let's start with the simplest case, where $c$ is nothing at all - this is called a unigram model. Why? Because every word is considered totally independently of any other word - it is a 1-gram!

Mathematically speaking, the unigram distribution is defined as follows. Consider a sequence of $n$ words:

$$ w_1, w_2, w_3, ... w_{n-2}, w_{n-1}, w_n $$

The unigram distribution is defined such that:

$$ p(w_n | w_1, w_2, ... w_{n-1}) = p(w_n) $$

That is, the preceding context is totally irrelevant - it makes no difference what words we have seen in the past!

This will be a very bad model of language, obviously. Nonetheless, let's train one!

The NLTK library is a great tool to explore this. Here is the Brown corpus, one of the first corpora developed for computational linguistics research:

In [ ]:
import nltk
from nltk.corpus import brown

try:
    words = brown.words()
except LookupError: # the corpus isn't available locally yet
    nltk.download('brown')
    words = brown.words()

words
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
Out[ ]:
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

Here is a simple way to define probabilities of words. Just count how many times each word occurs, and divide by the total number of words!

For instance, in:

the cat sat on the mat

We have 6 words, and 2 of them are "the," so we'll assign $p(\text{the}) = 2/6$.
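Here is that toy example as a quick code sketch (the variable names are just for illustration):

In [ ]:
toy_words = "the cat sat on the mat".split() # ['the', 'cat', 'sat', 'on', 'the', 'mat']
p_the = toy_words.count("the") / len(toy_words)
print(p_the) # 2/6, about 0.333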

In [ ]:
unigram_counts = {} # dictionary, a mapping

for word in words: # iterate over all the words
    if word in unigram_counts: # if we have already seen the word
        unigram_counts[word] += 1 # add 1 to its count
    else: # if this is a new word
        unigram_counts[word] = 1 # add a new entry to the dictionary, value 1

# we can access any word's count like this:
print(f'  The: {unigram_counts["The"]}')
print(f'hello: {unigram_counts["hello"]}')
  The: 7258
hello: 4

Let's now divide the counts by the sum of all the counts, giving us probabilities:

In [ ]:
# iterate over key-value pairs in dict
for key, value in unigram_counts.items():
    # re-assign the values with the normalized values
    # len(words) is the number of words in the corpus
    unigram_counts[key] = value / len(words)
In [ ]:
def p_unigram(word):
    if word in unigram_counts:
        return unigram_counts[word]
    else:
        return 0

# the corpus is case-sensitive, so we add the capitalized and lowercase variants
print(f'  p(The): {p_unigram("The")+p_unigram("the")}')
print(f'p(hello): {p_unigram("hello")+p_unigram("Hello")}')
  p(The): 0.06025790739171472
p(hello): 8.611840246918683e-06

We can also confirm that our probability distribution is valid! Calling .keys() on our dictionary gives us the support (and .values() gives the probabilities); we cast both to lists for later convenience.

In [ ]:
unigram_support = list(unigram_counts.keys())
unigram_probs = list(unigram_counts.values())
is_valid_probability_distribution(p_unigram, unigram_support)
Out[ ]:
True

Great! Let's try to generate some text from our unigram distribution.

In [ ]:
def generate_unigrams(sequence_length: int):
    sequence = [] # initialize empty sequence
    for i in range(sequence_length): # sample this many words
        choice = random.choices(
            unigram_support, # choose from the support
            weights=unigram_probs # given their probabilities
        )
        sequence += choice # add on to the sequence
    return sequence
In [ ]:
' '.join(generate_unigrams(20))
Out[ ]:
'a all a Administration one subdivision , in critical displayed of Forsythe and example walked on Report when . Minutes'

Not very good, right?

Let's try higher order $n$-grams...

Learning a bigram model¶

It's not too hard to turn this corpus into a bigram model - we just need to be careful about:

  • indexing correctly
  • accessing counts correctly
In [ ]:
# nested dictionary
bigram_counts = {}

# a corpus of N words contains N-1 bigrams
for i in range(len(words) - 1):

    # our bigram is "word1 word2"
    word1, word2 = words[i], words[i+1]

    # every word gets its own dictionary!
    if word1 not in bigram_counts:
        bigram_counts[word1] = {}

    # if it's a new bigram, set up a count
    if word2 not in bigram_counts[word1]:
        bigram_counts[word1][word2] = 1
    else: # otherwise just count
        bigram_counts[word1][word2] += 1
In [ ]:
print(bigram_counts['Fulton']['County'])
6

So, the bigram "Fulton County" occurs 6 times in the Brown corpus. Great!

How do we get probabilities from this dictionary of dictionaries?

Well, it depends on what we want.

Joint probabilities from the counts¶

Remember that the joint probability $p(x,y)$ tells us the overall probability of seeing $x$ and $y$ together, out of all possible combinations of $x$ and $y$.

If we look at any particular count in our dictionary of dictionaries, we know how many times we observed that bigram. Of course, the sum of all the counts is the total number of bigrams we saw.

So, we can divide the two to make a claim about "what proportion of all bigrams are this particular bigram?" That is our joint probability.

In [ ]:
s = 0 # total number of bigrams observed
for word1 in bigram_counts:
    for word2 in bigram_counts[word1]:
        s += bigram_counts[word1][word2]

bigram_joint_probs = {} # new dictionary of dictionaries
for word1 in bigram_counts: # for each word1
    bigram_joint_probs[word1] = {} # set up a new dictionary
    for word2 in bigram_counts[word1]: # for each word2
        # compute p(x,y) and store it
        bigram_joint_probs[word1][word2] = bigram_counts[word1][word2] / s

# p(Fulton County)
print(bigram_joint_probs['Fulton']['County'])

# earlier, we found that "Fulton County" occurs 6 times
# we also know that a corpus of N words has N-1 bigrams:
print(6 / (len(words) - 1))
5.1671085979825885e-06
5.1671085979825885e-06

Conditional probabilities from the counts¶

That's all fine and dandy, but an $n$-gram model uses conditional probabilities. How do we get those?

The right way to think about this is that each word gets its own probability distribution.

Consider a single dictionary from the counts:

In [ ]:
i = 0 # easy way to just look at a few entries
for word2 in bigram_counts['Fulton']:
    print(word2, bigram_counts['Fulton'][word2])
    i += 1
    if i == 5:
        break
County 6
Superior 2
legislators 2
taxpayers 1
ordinary's 1

We can think about this dictionary as defining a probability distribution conditioned on having just seen the word "Fulton."

We just need to normalize the counts within each dictionary:

In [ ]:
bigram_cond_probs = {} # new dictionary of dictionaries
for word1 in bigram_counts: # for each word1
    bigram_cond_probs[word1] = {} # set up a new dictionary
    s = 0
    for word2 in bigram_counts[word1]: # for each word2
        s += bigram_counts[word1][word2] # add up the counts
    for word2 in bigram_counts[word1]: # for each word2
        # compute p(y|x) and store it
        bigram_cond_probs[word1][word2] = bigram_counts[word1][word2] / s

# p(County|Fulton)
print(bigram_cond_probs['Fulton']['County'])

# sanity check! it should be:
# count(Fulton County) / count(Fulton)
count_fulton_county = bigram_counts['Fulton']['County']
count_fulton = sum(bigram_counts['Fulton'].values())
print(count_fulton_county / count_fulton)
0.35294117647058826
0.35294117647058826

Ok! Let's generate!

We'll sample a starting word from our good ol' unigram model, and from there, we'll condition on whatever word we last sampled to produce the next one!

In [ ]:
def generate_bigrams(sequence_length: int):

    sequence = generate_unigrams(1) # initialize sequence with a unigram gen.

    for i in range(sequence_length - 1): # sample this many words

        last_word_dict = bigram_cond_probs[sequence[-1]]

        choice = random.choices(
            list(last_word_dict.keys()), # choose from the support
            weights=list(last_word_dict.values()) # given their probabilities
        )
        sequence += choice # add on to the sequence

    return sequence
In [ ]:
' '.join(generate_bigrams(20))
Out[ ]:
'up good look at four hundred years , and affied unto him . `` I hold up to show a'

This sentence probably didn't make much sense as a whole - but parts of it are meaningful, right!? Isn't that odd?

Higher-order models¶

3-grams?

In [ ]:
trigram_counts = {}

# count
for i in range(len(words) - 2):

    bigram_context = (words[i], words[i + 1])

    if bigram_context not in trigram_counts:
        trigram_counts[bigram_context] = {}

    third_word = words[i + 2]

    if third_word not in trigram_counts[bigram_context]:
        trigram_counts[bigram_context][third_word] = 1
    else:
        trigram_counts[bigram_context][third_word] += 1

# normalize
for bigram_context in trigram_counts:
    s = 0
    for third_word in trigram_counts[bigram_context]:
        s += trigram_counts[bigram_context][third_word]
    for third_word in trigram_counts[bigram_context]:
        trigram_counts[bigram_context][third_word] /= s
In [ ]:
def generate_trigrams(sequence_length: int):

    sequence = generate_bigrams(2) # initialize the sequence with a bigram generation

    for i in range(sequence_length - 2):

        bigram_context = (sequence[-2], sequence[-1])

        # trigram_counts was normalized in place above, so this holds probabilities
        context_dict = trigram_counts[bigram_context]

        choice = random.choices(
            list(context_dict.keys()), # choose from the support
            weights=list(context_dict.values()) # given their probabilities
        )

        sequence += choice # add on to the sequence

    return sequence
In [ ]:
' '.join(generate_trigrams(20))
Out[ ]:
'of Af . Thus if the revenues from any kind of case studies of fluids for hydraulically operated equipment ,'

4-grams???

In [ ]:
four_gram_counts = {}

# count
for i in range(len(words) - 3):

    trigram_context = (words[i], words[i + 1], words[i + 2])

    fourth_word = words[i + 3]

    if trigram_context not in four_gram_counts:
        four_gram_counts[trigram_context] = {}

    if fourth_word not in four_gram_counts[trigram_context]:
        four_gram_counts[trigram_context][fourth_word] = 1
    else:
        four_gram_counts[trigram_context][fourth_word] += 1

for trigram_context in four_gram_counts:
    s = 0
    for fourth_word in four_gram_counts[trigram_context]:
        s += four_gram_counts[trigram_context][fourth_word]

    for fourth_word in four_gram_counts[trigram_context]:
        four_gram_counts[trigram_context][fourth_word] /= s
In [ ]:
def generate_four_grams(sequence_length: int):

    sequence = generate_trigrams(3) # initialize the sequence with a trigram generation

    for i in range(sequence_length - 3):

        trigram_context = (sequence[-3], sequence[-2], sequence[-1])

        # four_gram_counts was normalized in place above, so this holds probabilities
        context_dict = four_gram_counts[trigram_context]

        choice = random.choices(
            list(context_dict.keys()), # choose from the support
            weights=list(context_dict.values()) # given their probabilities
        )

        sequence += choice # add on to the sequence

    return sequence
In [ ]:
' '.join(generate_four_grams(20))
Out[ ]:
"Iliad has two words for the Hound Of Heaven's Pursuit ) by judicial fiat . They didn't . The Department's"

Wow. This looks like actual text, right?
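You may have noticed that the trigram and 4-gram code is nearly identical. Here is a hedged sketch of a pair of functions that builds and samples from an $n$-gram model for any $n \geq 2$ (the names build_ngram_probs and generate_ngrams are just for illustration; this is not a cell we ran above):

In [ ]:
import random

def build_ngram_probs(words, n):
    # map each (n-1)-word context tuple to a dict of next-word probabilities
    probs = {}
    for i in range(len(words) - n + 1):
        context = tuple(words[i:i + n - 1])
        next_word = words[i + n - 1]
        probs.setdefault(context, {})
        probs[context][next_word] = probs[context].get(next_word, 0) + 1
    for context in probs: # normalize each context's counts into probabilities
        total = sum(probs[context].values())
        for next_word in probs[context]:
            probs[context][next_word] /= total
    return probs

def generate_ngrams(probs, n, sequence_length):
    # start from a random context that actually occurs in the corpus
    sequence = list(random.choice(list(probs.keys())))
    while len(sequence) < sequence_length:
        context = tuple(sequence[-(n - 1):])
        # like the trigram/4-gram code above, this raises a KeyError if we wander
        # into a context that never appears with a following word in the corpus
        context_dict = probs[context]
        sequence += random.choices(
            list(context_dict.keys()),
            weights=list(context_dict.values())
        )
    return sequence

# for example:
# five_gram_probs = build_ngram_probs(words, 5)
# print(' '.join(generate_ngrams(five_gram_probs, 5, 20)))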

Concluding Remarks¶

$n$-grams are really cool, and are still in use for research purposes all the time! I am currently working on research regarding how well neural networks (like ChatGPT) can learn different $n$-gram distributions!

They also capture the basic idea behind how ChatGPT works, believe it or not! Generative transformer models like ChatGPT use a far more complex statistical framework that models this behavior much more robustly than simple counting.

What LLMs do is try to learn the right probability distribution (each dictionary within a dictionary) for EVERY possible context! Many clever mathematical and engineering breakthroughs make this possible, and I won't cover them here.

Discussion Questions¶

  • Do you think that humans are $n$-gram models? Why or why not?
  • Were you surprised by how well $n$-grams seem to mimic real text? Or do you think they are bad at this?
  • Many linguists find it upsetting that simple statistical models are so accurate. Others take comfort in the fact that statistical models struggle to accurately predict certain phenomena. Do you lean one way or the other after this exercise?
  • Did you enjoy this exercise? What questions do you still have?

Things for you to explore...¶

...if you are new to Python:¶

Think Python is an excellent, free book to learn Python.

NLTK, the Python library we used, has a short book which I have not read all the way through, but which has some nice tutorials.

...if you are interested in machine learning and deep learning:¶

The scikit-learn Python library has just about every ML algorithm you could possibly dream of, along with great tutorials.

PyTorch is probably my favorite Python library ever. It is used to build deep learning models, and I use it constantly for all sorts of fun research and experiments.

HuggingFace is a great collection of Python libraries with open-source AI models, datasets, evaluation metrics, and lots more.

...if you are interested in natural language processing (NLP):¶

Jurafsky and Martin is the bible of NLP. It is a free book, and is regularly updated. This notebook covers a lot of the same material as Chapter 3.

spaCy and Stanza (Stanford's NLP tools) are excellent for performing "classic" NLP tasks.

...if you want to learn about cutting-edge research:¶

I like Google Scholar the best to look up papers. A lot of the excellent NLP work is published in ACL venues.

And all that is the tip of the iceberg. If you are ever curious about:

  • computer science
  • linguistics
  • mathematics
  • physics
  • graduate school (master's or PhD)
  • Georgetown

Feel free to reach out! drd92@georgetown.edu will be my email for at least the next 5 years while I complete my PhD.