-
Notifications
You must be signed in to change notification settings - Fork 0
/
entro.py
56 lines (49 loc) · 2.01 KB
/
entro.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#! /usr/local/bin/python
import re
from collections import Counter
from math import log
def build_freq_from_file(filename):
    """Count lowercase-letter and space frequencies in a given text file.

    Each line is stripped of surrounding whitespace and lowercased;
    every character other than a-z and the space is discarded.
    Returns a collections.Counter mapping character -> count.
    """
    letter_freq = Counter()
    # Compile once, outside the line loop; raw string for the regex.
    not_letters_n_space = re.compile(r"[^a-z ]+")
    with open(filename, 'r') as input_file:
        for line in input_file:
            lower_case_line = line.strip().lower()
            # Counter.update counts every character of the cleaned string.
            letter_freq.update(not_letters_n_space.sub('', lower_case_line))
    # No explicit close(): the with-statement already closed the file.
    return letter_freq
def calculate_entropy_from_freq(rel_freq):
    """Treat the frequencies as probabilities and return Shannon entropy in bits.

    rel_freq is a mapping from symbol to (positive) count.  Returns 0.0 for
    an empty mapping instead of raising ZeroDivisionError.
    """
    total = float(sum(rel_freq.values()))
    if not total:
        # An empty distribution carries no information.
        return 0.0
    entropy = 0.0
    for count in rel_freq.values():
        probability = count / total
        # Accumulate -p * log2(p); summing negated terms avoids a final sign flip.
        entropy -= probability * log(probability, 2)
    return entropy
def kl_divergence(entropy, cross_entropy):
    """Kullback-Leibler divergence, in bits.

    D_KL(P || Q) = H(P, Q) - H(P): the expected number of extra bits per
    symbol paid for encoding P with a code that was optimized for Q.
    """
    excess_bits = cross_entropy - entropy
    return excess_bits
def calculate_cross_entropy_from_freqs(freq_a, freq_b):
    """Cross entropy H(P, Q) in bits, with P taken from freq_a and Q from freq_b.

    Iterates the symbols of freq_b, so log() never receives zero; symbols
    with zero probability under P contribute nothing and are skipped.
    Uses .get() so plain dicts work as well as Counters (the original
    `except ValueError` was dead code — the actual risk with a plain dict
    was an uncaught KeyError on freq_a[letter]).  Returns 0.0 when either
    distribution is empty.
    """
    total_a = float(sum(freq_a.values()))
    total_b = float(sum(freq_b.values()))
    if not total_a or not total_b:
        return 0.0
    cross_entropy = 0.0
    for letter, count_b in freq_b.items():
        p_a = freq_a.get(letter, 0) / total_a
        if p_a:
            # Accumulate -p(x) * log2(q(x)).
            cross_entropy -= p_a * log(count_b / total_b, 2)
    return cross_entropy
if __name__ == '__main__':
    # AiW is the Project Gutenberg version of Alice in Wonderland
    entropy_of_aiw = calculate_entropy_from_freq(build_freq_from_file("AiW.txt"))
    # AoSH is the Project Gutenberg version of The Adventures of Sherlock Holmes
    # print() function form works on both Python 2 and Python 3; the
    # original `print x` statements were a SyntaxError under Python 3.
    print(calculate_entropy_from_freq(build_freq_from_file("AoSH.txt")))
    xentropy = calculate_cross_entropy_from_freqs(build_freq_from_file("AiW.txt"),
                                                  build_freq_from_file("AoSH.txt"))
    # KL divergence of AiW's letter distribution vs. AoSH's.
    print(kl_divergence(entropy_of_aiw, xentropy))