Let's start with all of the code we've written so far and see how varying the order that the nodes are added allows us to create different "keys" for encryption
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
class TreeNode(object):
def __init__(self, key = None):
self.key = key
self.left = None
self.right = None
def __str__(self):
ret = ""
if self.key:
ret = "{}".format(self.key)
return ret
def compute_coords(self, x = [0], y = 0):
if self.left:
self.left.compute_coords(x, y-6)
self.x = x[0]
self.y = y
x[0] += 1
if self.right:
self.right.compute_coords(x, y-6)
def draw(self):
# Draw a dot
plt.scatter(self.x, self.y, 50, 'k')
# Draw some text indicating what the key is
plt.text(self.x-0.2, self.y-1.5, "{}".format(self))
# Offset in x
x1, y1 = self.x, self.y
if self.left:
# Draw a line segment from my node to this left child
x2, y2 = self.left.x, self.left.y
plt.plot([x1, x2], [y1, y2])
plt.text(0.5*(x1+x2), 0.5*(y1+y2), "0")
self.left.draw()
if self.right:
# Draw a line segment from my node to this right child
x2, y2 = self.right.x, self.right.y
plt.plot([x1, x2], [y1, y2])
plt.text(0.5*(x1+x2), 0.5*(y1+y2), "1")
self.right.draw()
def build_codebook(self, codebook, bstr):
"""
Parameters
----------
codebook: Dictionary of codebook we're filling in
bstr: list of characters
All the bits we've traversed to get to this node
"""
if self.key:
# This is a leaf node, we can complete the code for this key
# String characters together, e.g. ["0", "0", "1"] turns into "001"
codebook[self.key] = "".join(bstr)
else:
# An internal node with two children
# Explore the left subtree
bstr.append("0")
self.left.build_codebook(codebook, bstr)
bstr.pop()
# Explore the right subtree
bstr.append("1")
self.right.build_codebook(codebook, bstr)
bstr.pop()
class BCBTree(object):
def __init__(self, chars):
nodes = deque()
for c in chars:
nodes.append(TreeNode(c))
while len(nodes) > 1:
# Take first two out of line
n1 = nodes.popleft()
n2 = nodes.popleft()
# Merge the two by making a new node
n12 = TreeNode()
n12.left = n1
n12.right = n2
# Add new node to the back of the line
nodes.append(n12)
# Last node left is root
self.root = nodes.pop()
def draw(self):
if self.root:
self.root.compute_coords()
self.root.draw()
plt.axis("off")
plt.axis("equal")
def get_codebook(self):
"""
Return a dictionary whose key is a character and
whose value is the binary string to use to encode that character
"""
codebook = {}
if self.root:
self.root.build_codebook(codebook, [])
return codebook
def encode(self, s):
codebook = self.get_codebook()
res = ""
for c in s:
res += codebook[c]
return res
def decode(self, s):
## Redacted for exercise
def get_chars(seed = None):
"""
Return a list of the 26 lowercase characters, as well as the space,
in a random order.
Parameters
----------
seed: int
A seed for the random permutation
"""
chars = ["{}".format(chr(ord('a') + i)) for i in range(26)]
chars += [" ", "."]
if seed:
np.random.seed(seed)
chars = [chars[i] for i in np.random.permutation(len(chars))]
return chars
chars1 = get_chars(1)
print(chars1)
T1 = BCBTree(chars1)
plt.figure(figsize=(10, 6))
T1.draw()
ciphertext = T1.encode("algorithms are fun.")
print(ciphertext)
print(T1.decode(ciphertext))
chars2 = get_chars(2)
print(chars2)
T2 = BCBTree(chars2)
plt.figure(figsize=(10, 6))
T2.draw()
print(T2.decode(ciphertext))
There are actually good reasons we may want to have a tree not be balanced, particularly when we want to compress the input data, rather than encrypt it. In this case, we'll have characters that are used more often up closer to the root (e.g. "e" and "r"), and in exchange, we're willing to have characters that aren't used very often take a hit and be much further from the root (e.g. "q" and "z")
To build a Huffman tree, we need to count the occurrences of each character we want to encode. Below I wrote some code to count occurrences of each character in the words dictionary we used in assignment 2
def get_char_counts(do_plot = True):
"""
Return a dictionary whose key is a character and whose
value is the number of times that character shows up in a word
"""
fin = open("words.txt")
lines = [l.strip() for l in fin.readlines()]
counts = {}
for line in lines:
[word, count] = line.split()
count = int(count)
for c in word:
if not c in counts:
counts[c] = 0
counts[c] += count
maxc = 0
for key, count in counts.items():
maxc = max(maxc, count)
counts[" "] = maxc+1
counts["."] = int(maxc/5)
if do_plot:
keys = sorted(counts.keys())
values = [counts[k] for k in keys]
plt.figure(figsize=(10, 5))
plt.bar(keys, values)
plt.xticks(np.arange(len(keys)), keys)
plt.show()
return counts
counts = get_char_counts()
print(counts)
We're now going to use a data structure known as a priority queue or a "heap" to create a Huffman Tree. Unlike a deque, when we add things to a priority queue, they can actually jump to the front of the line. In this case, we're going to use the count of the character as its priority, as we want to merge characters with lower counts first when we're building the tree from the bottom up. Below is a simple example showing how the order the nodes come out is not necessarily the order they were added
from heapq import heappush, heappop
nodes = []
heappush(nodes, (2, 'z'))
heappush(nodes, (10, 'a'))
heappush(nodes, (1, 'q'))
print(heappop(nodes))
print(heappop(nodes))
print(heappop(nodes))
Below is the beginning of a Huffman tree implementation using a heap, which you will finish in an exercise
from heapq import heappush, heappop
# Inherit from the BCBTree class. The only thing that
# changes is the constructor
class HuffmanTree(BCBTree):
def __init__(self, counts):
nodes = []
for key, count in counts.items():
heappush(nodes, (count, TreeNode(key)))
while len(nodes) > 1:
(count1, n1) = heappop(nodes)
(count2, n2) = heappop(nodes)
## TODO: Student will fill this in
self.root = nodes[0][1]