In [104]:
import numpy as np # numerical arrays...
import itertools # to enumerate every possible n-gram...

from nltk.corpus import words # training examples...

from PIL import Image # imaging...
import seaborn as sns
import matplotlib.pyplot as plt


words_list = words.words()
words_list = [x for x in words_list if len(x) >= 2] # keep only words with at least two characters

Utility functions:

In [105]:
def asvoid(arr):
    """View the array as dtype np.void (bytes)
    This collapses ND-arrays to 1D-arrays, so you can perform 1D operations on them.
    https://stackoverflow.com/a/16216866/190597 (Jaime)"""    
    arr = np.ascontiguousarray(arr)
    return arr.view(np.dtype((np.void, arr.dtype.itemsize * arr.shape[-1])))

def find_index(arr, x):
    """Return the indices of the rows of arr that are equal to x."""
    arr_as1d = asvoid(arr) # thank you to stack overflow (same link as above)
    x = asvoid(x)
    return np.nonzero(arr_as1d == x)[0]
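
A quick, illustrative sanity check of these helpers (the expected result is noted in the comment):

In [ ]:
rows = np.array([list("ab"), list("cd"), list("ab")])  # three rows of single-character tokens
find_index(rows, np.array(list("ab")))  # should return array([0, 2]): the rows equal to ['a', 'b']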

Word Generation With N-Grams

The N-Gram model gives probability distributions over strings of a given language. For example, in French, the word "xyloqsjdfkljsdfqmskfdl" has a smaller probability of being a valid word than "xylophone". How much smaller? The answer to this question will enable us to create new words.

1. The trainable model

Our model will predict the likelihood of a token given the $n$ preceding tokens. Tokens could be letters or words, but an end token is also needed; Python strings have no such terminator, so we'll append one to each string ourselves. We will also prepend $n$ end tokens to each string, and start analyzing it at the first non-end-token character. Let $\mathcal{T}$ be the set of tokens. Here we will build an updatable model that estimates the $|\mathcal{T}|^{n+1}$ parameters of the model. The parameters are the probabilities that an $n$-gram (sequence of $n$ tokens) is followed by any given token, for all tokens and all $n$-grams. Let $f:\mathcal{T}\times \mathcal{T}^{n} \to [0,1]$ be defined as $$\forall t \in \mathcal{T}, w \in \mathcal{T}^n, f(t,w) := P(t | w).$$ Note that $$f(t,w)\approx \frac{\# w.t}{\# w},$$ where $\#$ stands for "the count of", and $.$ stands for string concatenation, aka "followed by". This model stores the parameters as:

  • The raw counts of each $(n+1)$-gram, as a one-dimensional array.

Note that from the raw counts of each $(n+1)$-gram we can deduce the count of any $n$-gram, so these counts are enough to reconstitute a $|\mathcal{T}|\times |\mathcal{T}|^n$ matrix containing the associated probabilities (a standalone sketch of this counting estimate follows). This model will be updated with new strings.
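
As a small illustration of the counting estimate $f(t,w)\approx \# w.t / \# w$ (a minimal sketch using `collections.Counter`, independent of the class defined below):

In [ ]:
from collections import Counter

# Minimal sketch of the counting estimate for n = 2, on a two-word corpus.
corpus = ["chaussettes", "chien"]
end = "\x00"
counts = Counter()
for word in corpus:
    padded = [end] * 2 + list(word) + [end]       # n end tokens in front, one at the end
    for i in range(2, len(padded)):
        counts[tuple(padded[i - 2:i + 1])] += 1   # count each 3-gram

def estimate(t, w):
    # f(t, w) ~ #(w.t) / #(w), where #(w) is recovered by summing over the next token
    total = sum(c for gram, c in counts.items() if gram[:2] == tuple(w))
    return counts[tuple(w) + (t,)] / total if total else 0

estimate("i", ["c", "h"])  # 1/2: "ch" is followed by "i" in "chien" and by "a" in "chaussettes"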

In [106]:
class Ngram:
    @staticmethod
    def find_nplusonegram_ending_at_index(tokenized, i, n):
        # The (n+1)-gram made of tokens tokenized[i-n-1] ... tokenized[i-1].
        return np.array(tokenized[i - n - 1:i])

    def __init__(self, n, tokenizer, end_token, alphabet: list):
        self.tokenizer = tokenizer
        self.n = n
        self.end_token = end_token

        # Initialize the alphabet as an array of all possible tokens, **including the end token**.
        # (NumPy may store the '\x00' end token as an empty string; see the comparison workaround in generate.)
        self.alphabet: np.ndarray = np.unique(np.array(alphabet + [end_token]))

        # All possible (n+1)-grams over the alphabet, and their raw counts.
        self.nplusone_grams: np.ndarray = np.array([np.array(p) for p in itertools.product(self.alphabet, repeat=self.n + 1)])
        self.raw_count_of_each_n_plus_one_gram = np.zeros((self.nplusone_grams.shape[0]), dtype=np.int64)

    def tokenize(self, s: str):
        # Pad with n end tokens at the beginning and one at the end.
        return [self.end_token] * self.n + self.tokenizer(s) + [self.end_token]

    def update_from_file(self, path: str):
        ... # Extract the list of strings, and use update_from_list

    def update_from_list(self, l: list[str]):
        # l must be a list of strings
        for w in l: # for each string in the list,
            t = self.tokenize(w)
            for i in range(self.n + 1, len(t) + 1): # for each token in the string, starting at token index n, including the ending token,
                nplusone_gram = Ngram.find_nplusonegram_ending_at_index(tokenized=t, i=i, n=self.n)
                index = find_index(self.nplusone_grams, nplusone_gram)[0]
                self.raw_count_of_each_n_plus_one_gram[index] += 1 # update the count of the corresponding (n+1)-gram

    def n_gram_count(self, n_gram):
        # Count of an n-gram, deduced by summing the counts of all (n+1)-grams that extend it.
        total = 0
        for tok in self.alphabet:
            index = find_index(self.nplusone_grams, n_gram + [tok])[0]
            total += self.raw_count_of_each_n_plus_one_gram[index]
        return total

    def f(self, t: str, w: list[str]):
        # Estimate f(t, w) = P(t | w) from the counts of w.t and of w.
        # t is a token, w is a list of n tokens.
        big_word_count = self.raw_count_of_each_n_plus_one_gram[find_index(self.nplusone_grams, np.array(list(w) + [t]))[0]]
        return 0 if big_word_count == 0 else big_word_count / self.n_gram_count(list(w))

Because we are modeling strings as Markov chains of order $n$, we can write, for $w\in \mathcal{T}^m$ with $m\in \mathbb{N}_{>n}$, $$P(w)=\prod_{i=1}^m P(w_i\,|\,w_{i-n:i-1}),$$ where the first contexts consist of end tokens thanks to the padding. This can be used for classification, by picking a threshold below which the probability is too low (for example, knowing the distributions, the threshold $\alpha$ can be chosen using the Neyman-Pearson lemma).
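
For instance, with $n=2$ and end token $\epsilon$, the word "chien" is scored as $$P(\text{chien}) = P(c\,|\,\epsilon\epsilon)\,P(h\,|\,\epsilon c)\,P(i\,|\,ch)\,P(e\,|\,hi)\,P(n\,|\,ie),$$ each factor being estimated from the counts stored in the model.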

In [ ]:
def probability(self, s:str):
    prod = 1 # the initial product value is 1
    tokenized = self.tokenize(s) # tokenize s (this adds the end-token padding).
    for i in range(self.n, len(tokenized)): # for each token t in s, starting at index n, excluding the ending token,
        t = tokenized[i]
        if t == self.end_token:
            break
        # the preceding n-gram (:= w); passing n-1 here returns an (n-1)+1 = n-gram ending just before index i
        w = Ngram.find_nplusonegram_ending_at_index(tokenized, i, self.n - 1)
        prod *= self.f(t, w) # multiply by P(t | w)
    return prod

Ngram.probability = probability

The following code should now work:

In [107]:
N = 2
END_TOKEN = '\x00'
PATH_TO_STRINGS_CSV = ...
ALPHABET = [c for c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-"]

def t(s:str): # example tokenizer: by character (end-token padding is handled by Ngram.tokenize)
    l = [c for c in s] # tokenize by char
    assert all(c in ALPHABET for c in l) # assert that the input only uses the alphabet
    return l

model_fr = Ngram(n=N,tokenizer=t,end_token=END_TOKEN,alphabet=ALPHABET) # generate new model from constructor, choosing n and a tokenizer (it must also be possible to `load` a model from csv)


model_fr.update_from_list(["chaussettes","chien"])# train the model using a list of strings
In [108]:
model_fr.probability("chien") # afaik with N=2 and no training on a big CSV file, this should be 1/2, because the probability that i follows ch is 1/2 and all the other probabilities in chien are 1.
Out[108]:
0.16666666666666666

We can also define a plotting function:

In [165]:
def plot(self):
    # Probability matrix: rows are suffix tokens t, columns are prefixes w (n-grams),
    # entries are f(t, w) = P(t | w).
    matArr = [[self.f(w[-1], w[:-1]) for w in [i for i in self.nplusone_grams if i[-1] == t]] for t in self.alphabet]

    fig, ax = plt.subplots(figsize=(10, 10))

    labels_y = self.alphabet
    # One x label per prefix: its first token (the end token is stored as "" in the NumPy array).
    labels_x = [w[:-1][0] for w in [i for i in self.nplusone_grams if i[-1] == ""]]

    ax = sns.heatmap(matArr, linewidth=1, xticklabels=1, yticklabels=1, ax=ax)
    ax.set_xlabel("Prefix $w$")
    ax.set_ylabel("Suffix $t$")
    ax.set_title("Probability of the $(n+1)$-gram $w.t$")

    ys = range(labels_y.shape[0])
    def format_y(tick_val, tick_pos):
        return labels_y[int(tick_val)] if int(tick_val) in ys else ''

    ax.yaxis.set_major_formatter(format_y)

    xs = range(len(labels_x))
    def format_x(tick_val, tick_pos):
        return labels_x[int(tick_val)] if int(tick_val) in xs else ''

    ax.xaxis.set_major_formatter(format_x)

    plt.show()

Ngram.plot = plot

Let us train a new model on the nltk word database in batches and show the result (an equivalent loop form of the batch updates is sketched after these cells).

In [109]:
model_en = Ngram(n=N,tokenizer=t,end_token=END_TOKEN,alphabet=ALPHABET)
model_en.update_from_list(words_list[:10000]) # train on the first batch of 10,000 English words
In [110]:
model_en.update_from_list(words_list[10000:20000]) # next batch
In [111]:
model_en.update_from_list(words_list[20000:30000]) # next batch
In [112]:
model_en.update_from_list(words_list[30000:40000])
In [113]:
model_en.update_from_list(words_list[40000:50000])
In [114]:
model_en.update_from_list(words_list[50000:100000])
In [115]:
model_en.update_from_list(words_list[100000:150000])
In [116]:
model_en.update_from_list(words_list[150000:200000])
In [117]:
model_en.update_from_list(words_list[200000:])
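
For reference, the batch updates above could equivalently be written as a single loop; a sketch (the uniform batch size is an assumption, the cells above use larger slices towards the end):

In [ ]:
# Loop form of the batched training above (illustrative only, not run here).
BATCH_SIZE = 10000
for start in range(0, len(words_list), BATCH_SIZE):
    model_en.update_from_list(words_list[start:start + BATCH_SIZE])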
In [166]:
model_en.plot()
In [119]:
model_en.probability("Jezebelish")
Out[119]:
1.2062902090706875e-14

Let's now train using a CSV file.
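
update_from_file was left as a stub in the class above; here is a minimal sketch of one possible implementation, assuming the CSV file simply contains one string per row in its first column (that layout is an assumption, not something specified above):

In [ ]:
import csv

def update_from_file(self, path: str):
    # Sketch only: assumes one string per row, in the first column, with no header.
    with open(path, newline="", encoding="utf-8") as f:
        strings = [row[0] for row in csv.reader(f) if row]
    self.update_from_list(strings)

Ngram.update_from_file = update_from_file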

In [120]:
model_fr.update_from_file(PATH_TO_STRINGS_CSV)# train the model using a csv file containing correct strings

We can now implement storing and loading. Note that the following code is less important than what comes before: if you are implementing this yourself, your code works OK, and you don't feel like implementing store and load, just go to part 2.

In [121]:
PATH_TO_MODEL_JSON = ...

def store(self,path:str):
    ...# the file must contain:
    ...# all attributes of self except the tokenizer (functions are not easily serializable)
    # loaded models will ASSUME by-letter tokenization


def load_from(path:str)->Ngram:
    ...

Ngram.store = store
Ngram.load_from = load_from
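
A possible JSON-based sketch, sticking to the constraints in the comments above (everything except the tokenizer is serialized; loaded models assume by-letter tokenization). The JSON layout below is just one possible choice:

In [ ]:
import json

def store(self, path: str):
    # Serialize everything except the tokenizer (functions are not JSON-serializable).
    with open(path, "w", encoding="utf-8") as f:
        json.dump({
            "n": self.n,
            "end_token": self.end_token,
            "alphabet": self.alphabet.tolist(),
            "raw_counts": self.raw_count_of_each_n_plus_one_gram.tolist(),
        }, f)

def load_from(path: str) -> Ngram:
    # Rebuild the model; by-letter tokenization is assumed, as noted above.
    with open(path, encoding="utf-8") as f:
        d = json.load(f)
    model = Ngram(n=d["n"], tokenizer=lambda s: list(s),
                  end_token=d["end_token"], alphabet=d["alphabet"])
    # The (n+1)-grams are regenerated in the same (sorted) order, so the counts line up.
    model.raw_count_of_each_n_plus_one_gram = np.array(d["raw_counts"], dtype=np.int64)
    return model

Ngram.store = store
Ngram.load_from = load_from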

The following should now be possible:

In [122]:
model_fr.store(PATH_TO_MODEL_JSON) # store the model in a file
#do stuff with the model
model_fr = Ngram.load_from(PATH_TO_MODEL_JSON) # restore the model to where it was before, from the file.

The end product of this first part is a model that can assign a probability to any given string.

2. Word generation

Time to generate new words! To do so, we will work with strings of tokens whose beginning is padded with $n$ end tokens, and repeatedly draw the next token according to the probability table obtained before.

First, let us define a function that samples the next token from the model, given the preceding $n$-gram.

In [152]:
def next_token(self, s:str):
    # Exclude the end token while the string is shorter than n: we want to generate words of size at least n.
    alphabet = self.alphabet if len(s) >= self.n else np.delete(self.alphabet, np.where(self.alphabet == self.end_token)[0])

    # Conditional distribution of the next token given the last n characters of s.
    probas = np.array([self.f(t, s[-self.n:]) for t in alphabet])

    # Sample the next token from this distribution (an alternative would be to always take np.argmax).
    choice = np.random.choice(alphabet, p=probas)

    return choice



Ngram.next_token = next_token
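
One caveat: while the generated string is shorter than $n$, the slice s[-self.n:] yields a context of fewer than $n$ tokens. Following the padding idea mentioned above, a variant that always conditions on exactly $n$ tokens could look like this (a sketch only, not the version used for the outputs below):

In [ ]:
# Sketch of a variant that left-pads short contexts with end tokens (illustrative only).
def next_token_padded(self, s: str):
    context = [self.end_token] * max(0, self.n - len(s)) + list(s)[-self.n:]
    alphabet = self.alphabet if len(s) >= self.n else np.delete(
        self.alphabet, np.where(self.alphabet == self.end_token)[0])
    probas = np.array([self.f(t, context) for t in alphabet])
    total = probas.sum()
    probas = probas / total if total > 0 else None  # fall back to uniform if the context was never seen
    return np.random.choice(alphabet, p=probas)

Ngram.next_token_padded = next_token_padded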

Finally, we can use next_token to generate new words:

In [155]:
def generate(self):
    # Seed with a random 1-gram from self.alphabet **that is not self.end_token** (using np.random).
    s: str = "".join(np.random.choice(np.delete(self.alphabet, np.where(self.alphabet == self.end_token)[0]), 1))
    while True:
        t = self.next_token(s).lower()
        # Compare against the end token as stored in the NumPy array (it is encoded differently there).
        if t == self.alphabet[np.where(self.alphabet == self.end_token)[0]]:
            break
        else:
            s += str(t)
    return s

Ngram.generate = generate

Last step!

In [163]:
[model_en.generate() for i in range(100)]
Out[163]:
['Leprin',
 'xiandvis',
 'Haliniscreximbihe',
 'Karcth',
 'juatalenorbeom',
 'hisede',
 'acomeabary',
 'Chminl',
 'blantrdarigaraclariericl',
 'Iteristed',
 'Dhonpsaly',
 'hesatoar',
 'ycoglifowegrmalig',
 'Xic',
 'ma',
 'rilacios',
 'Satumyl',
 'ym',
 'Guisemitovethicke',
 'Godag',
 'Jetotheaxidoparennapat',
 'weraraug',
 'Anve',
 'og',
 'Pogn',
 'Acly',
 'lesa',
 'Orct',
 'nintech',
 'howhyprom',
 'dabraury',
 'glve',
 'g',
 'hat',
 'Yudatos',
 'pene',
 'Te',
 'Bogans',
 'Pobakiocosesydoprowictalgra',
 'Grserifoclein',
 'Itedivab',
 'katerenthimarooaricaionicaed',
 'Eunatelzzatere',
 'Ateminiafine',
 'xparochnuapsprfuse',
 'sissy',
 'Kariarely',
 'Gume',
 'pado',
 'Tor',
 'g',
 'l',
 'Oghorlletioshicr',
 'Bretialemupher',
 'qungligeesad',
 'oniuroisedaurmisistare',
 'En',
 'Mopitexerapalicleotonteglilyp',
 'Gontesteratequloremary',
 'y',
 'Scachay',
 'Innesyhodaclios',
 'Tatoocopitrerene',
 'xtrargine',
 'er',
 'veadove',
 'ganffetikeline',
 'Khonathincorangr',
 'Gncrt',
 'Thn',
 'Xiscklundercrerplosecosm',
 'Qumend',
 'qunipehomalleneriortehed',
 'pionalun',
 'jupaee',
 'pll',
 'Gry',
 'esisimaiodly',
 'qublrotwlephapunu',
 'Se',
 'erosatenridme',
 'untintc',
 'wote',
 'grinostis',
 'keronvivinckor',
 'El',
 'Glatrorant',
 'Radrtotrenalvun',
 'Dassentereep',
 'Thitchesptueactafes',
 'Gohativinalycckeat',
 'Phite',
 'hini',
 'Unactttes',
 'Zorely',
 'isstoneadrerlimondenconteragudngegeuken',
 'ponal',
 'auliloxaknticoraromeralilous',
 'Yogizar',
 've']