Encryption

Let's start with all of the code we've written so far and see how varying the order that the nodes are added allows us to create different "keys" for encryption

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from collections import deque

class TreeNode(object):
    def __init__(self, key = None):
        self.key = key
        self.left = None
        self.right = None
    
    def __str__(self):
        ret = ""
        if self.key:
            ret = "{}".format(self.key)
        return ret

    def compute_coords(self, x = [0], y = 0):
        if self.left:
            self.left.compute_coords(x, y-6)
        self.x = x[0]
        self.y = y
        x[0] += 1
        if self.right:
            self.right.compute_coords(x, y-6)

    def draw(self):
        # Draw a dot
        plt.scatter(self.x, self.y, 50, 'k')
        # Draw some text indicating what the key is
        plt.text(self.x-0.2, self.y-1.5, "{}".format(self))
        # Offset in x
        x1, y1 = self.x, self.y
        if self.left:
            # Draw a line segment from my node to this left child
            x2, y2 = self.left.x, self.left.y
            plt.plot([x1, x2], [y1, y2])
            plt.text(0.5*(x1+x2), 0.5*(y1+y2), "0")
            self.left.draw()
        if self.right:
            # Draw a line segment from my node to this right child
            x2, y2 = self.right.x, self.right.y
            plt.plot([x1, x2], [y1, y2])
            plt.text(0.5*(x1+x2), 0.5*(y1+y2), "1")
            self.right.draw()
    
    def build_codebook(self, codebook, bstr):
        """
        Parameters
        ----------
        codebook: Dictionary of codebook we're filling in
        
        bstr: list of characters
            All the bits we've traversed to get to this node
        """
        if self.key:
            # This is a leaf node, we can complete the code for this key
            # String characters together, e.g. ["0", "0", "1"] turns into "001"
            codebook[self.key] = "".join(bstr)
        else:
            # An internal node with two children
            # Explore the left subtree
            bstr.append("0")
            self.left.build_codebook(codebook, bstr)
            bstr.pop()
            # Explore the right subtree
            bstr.append("1")
            self.right.build_codebook(codebook, bstr)
            bstr.pop()

class BCBTree(object):
    def __init__(self, chars):
        nodes = deque()
        for c in chars:
            nodes.append(TreeNode(c))
        while len(nodes) > 1:
            # Take first two out of line
            n1 = nodes.popleft()
            n2 = nodes.popleft()
            # Merge the two by making a new node
            n12 = TreeNode()
            n12.left = n1
            n12.right = n2
            # Add new node to the back of the line
            nodes.append(n12)
        # Last node left is root
        self.root = nodes.pop()

    def draw(self):
        if self.root:
            self.root.compute_coords()
            self.root.draw()
        plt.axis("off")
        plt.axis("equal")
            
    def get_codebook(self):
        """
        Return a dictionary whose key is a character and
        whose value is the binary string to use to encode that character
        """
        codebook = {}
        if self.root:
            self.root.build_codebook(codebook, [])
        return codebook
    
    def encode(self, s):
        codebook = self.get_codebook()
        res = ""
        for c in s:
            res += codebook[c]
        return res
    
    def decode(self, s):
        ## Redacted for exercise
In [3]:
def get_chars(seed = None):
    """
    Return a list of the 26 lowercase characters, as well as the space,
    in a random order.

    Parameters
    ----------
    seed: int
        A seed for the random permutation
    """
    chars = ["{}".format(chr(ord('a') + i)) for i in range(26)]
    chars += [" ", "."]
    if seed:
        np.random.seed(seed)
        chars = [chars[i] for i in np.random.permutation(len(chars))]
    return chars
In [5]:
chars1 = get_chars(1)
print(chars1)
T1 = BCBTree(chars1)
plt.figure(figsize=(10, 6))
T1.draw()
['y', 'r', 't', 'u', 'o', 'd', 'w', 'k', 'v', 'e', 'c', 'x', 'g', 's', 'n', 'h', 'z', 'b', 'q', 'a', 'p', '.', ' ', 'j', 'i', 'm', 'l', 'f']
In [6]:
ciphertext = T1.encode("algorithms are fun.")
print(ciphertext)
print(T1.decode(ciphertext))
1101100101010001100010010000010101011100011010111110110110100110001111100011010111011011101
algorithms are fun.
In [7]:
chars2 = get_chars(2)
print(chars2)
T2 = BCBTree(chars2)
plt.figure(figsize=(10, 6))
T2.draw()
['b', 'a', 'o', 'j', 't', 'r', 'q', 'g', 'd', 'v', ' ', 'z', 'm', 'e', 'k', 'f', 'u', 'c', 'h', 'y', 'x', 's', 'l', 'w', '.', 'n', 'p', 'i']
In [8]:
print(T2.decode(ciphertext))
ypmta.ofnelyavlijks

Huffman Trees for String Compression

There are actually good reasons we may want to have a tree not be balanced, particularly when we want to compress the input data, rather than encrypt it. In this case, we'll have characters that are used more often up closer to the root (e.g. "e" and "r"), and in exchange, we're willing to have characters that aren't used very often take a hit and be much further from the root (e.g. "q" and "z")

Frequency Counts

To build a Huffman tree, we need to count the occurrences of each character we want to encode. Below I wrote some code to count occurrences of each character in the words dictionary we used in assignment 2

In [9]:
def get_char_counts(do_plot = True):
    """
    Return a dictionary whose key is a character and whose
    value is the number of times that character shows up in a word
    """
    fin = open("words.txt")
    lines = [l.strip() for l in fin.readlines()]
    counts = {}
    for line in lines:
        [word, count] = line.split()
        count = int(count)
        for c in word:
            if not c in counts:
                counts[c] = 0
            counts[c] += count
    maxc = 0
    for key, count in counts.items():
        maxc = max(maxc, count)
    counts[" "] = maxc+1
    counts["."] = int(maxc/5)
    if do_plot:
        keys = sorted(counts.keys())
        values = [counts[k] for k in keys]
        plt.figure(figsize=(10, 5))
        plt.bar(keys, values)
        plt.xticks(np.arange(len(keys)), keys)
        plt.show()
    return counts
counts = get_char_counts()
print(counts)
{'t': 247577342738, 'h': 106367962556, 'e': 349588141984, 'o': 228025627088, 'f': 61328927423, 'a': 243662684512, 'n': 207910712159, 'd': 107605388542, 'i': 223353030415, 'r': 201896673641, 's': 207080253606, 'b': 49798922187, 'y': 52941043438, 'w': 44294405401, 'u': 86950627146, 'm': 84155576549, 'l': 130649920346, 'v': 34402346309, 'c': 113913698859, 'p': 77553040250, 'g': 63045208347, 'k': 24380950863, 'x': 9151143994, 'j': 7637833834, 'z': 4192477980, 'q': 4218467887, ' ': 349588141985, '.': 69917628396}

We're now going to use a data structure known as a priority queue or a "heap" to create a Huffman Tree. Unlike a deque, when we add things to a priority queue, they can actually jump to the front of the line. In this case, we're going to use the count of the character as its priority, as we want to merge characters with lower counts first when we're building the tree from the bottom up. Below is a simple example showing how the order the nodes come out is not necessarily the order they were added

In [11]:
from heapq import heappush, heappop

nodes = []
heappush(nodes, (2, 'z'))
heappush(nodes, (10, 'a'))
heappush(nodes, (1, 'q'))
print(heappop(nodes))
print(heappop(nodes))
print(heappop(nodes))
(1, 'q')
(2, 'z')
(10, 'a')

Below is the beginning of a Huffman tree implementation using a heap, which you will finish in an exercise

In [ ]:
from heapq import heappush, heappop
# Inherit from the BCBTree class.  The only thing that
# changes is the constructor
class HuffmanTree(BCBTree): 
    def __init__(self, counts):
        nodes = []
        for key, count in counts.items():
            heappush(nodes, (count, TreeNode(key)))
        while len(nodes) > 1:
            (count1, n1) = heappop(nodes)
            (count2, n2) = heappop(nodes)
            ## TODO: Student will fill this in
        self.root = nodes[0][1]
In [ ]: