markov.py

import math
import cson
import random
import re
import traceback

from itertools import tee


anre = re.compile(r'[^a-zA-Z1-9\_\x01]+')
empties = 0

unknown = re.compile(r'\ufffd+')

def unifix(s):
    try:
        return unicode(s, "utf-8")

    except TypeError:
        return s

def de_unifix(s):
    try:
        return str(s, "utf-8")

    except TypeError:
        return s

def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)

def decertcode(_string, encoding, replacer):
    if type(_string) is unicode:
        _string = _string.encode("utf-8")

    a = _string.decode(encoding, errors="replace")

    return unknown.sub(replacer, a)

def alphanumeric_only(s):
    if s:
        return anre.sub('', s).lower()

    else:
        return None

def formalize(s):
    s = s.strip()

    r = re.sub(r"(\A\w)|"+                  # start of string
             "(?<!\.\w)([\.?!] )\w|"+     # after a ?/!/. and a space,
                                          # but not after an acronym
             "\w(?:\.\w)|"+               # start/middle of acronym
             "(?<=\w\.)\w",               # end of acronym
             lambda x: x.group().upper(),
             s)

    if not r.endswith("."):
        r = u"{}.".format(decertcode(r, "ascii", "[?]"))

    return r

# Many thanks to Ned "nedbat" Batchelder for the SO solution which was basis for this function -- http://stackoverflow.com/a/3679747/5129091
def weighted_choice(choices):
    if any(a == choices for a in (tuple(), [], {})) :
        raise ValueError("The choice list is empty!")

    while True:
        if type(choices) in (tuple, list) or issubclass(type(choices), (tuple, list)):
            for b in tuple(choices):
                if type(b) is int:
                    break

                if len(b) not in (1, 2):
                    raise ValueError("Item size must be 1 (weight only, returns index) or 2 (item and weight, returns item); sizes were ({})!".format(", ".join(str(len(x) for x in choices))))

            if type(choices[0]) is int:
                choices = dict(enumerate(choices))
                break

            if len(choices[0]) == 2:
                choices = dict(choices)

            elif len(choices[0]) == 1:
                choices = dict(enumerate(choices))

        elif type(choices) is not dict:
            raise TypeError("The choices must be either a dict or an iterable!")

        break

    total = sum(w for c, w in choices.items())
    r = random.uniform(0, total)
    upto = 0

    for c, w in choices.items():
        if upto + w >= r:
            return c

        upto += w

    raise WeightError("No choices selected.")

def locate(i, aname, attribute):
    try:
        return i[[getattr(x, aname) for x in i].index(attribute)]

    except ValueError:
        return None

class MarkovChain(object):
    def __init__(self, initial=(), filename="_markov.cson"):
        self._data = dict(initial)
        self.filename = filename

    def names(self):
        return [a["normal"] for a in self._data.values()]

    def keywords(self):
        return self._data.keys()

    def __len__(self):
        return len(self._data)

    def receive(self, s, sep=" "):
        s = s.rstrip(".")
        p = pairwise(s.split(sep))

        if len(p) == 0:
            return

        for x in p:
            self._new(x)

        self._new((p[-1][1], None))

    def _new(self, pair):
        apair = tuple(alphanumeric_only(s) for s in pair)

        if not apair[0]:
            return

        a = self._data.get(apair[0], None)

        if a:
            if not apair[1]:
                return

            if apair[1] in a["choices"]:
                a["choices"][apair[1]] += 1

            else:
                a["choices"][apair[1]] = 1

        else:
            if apair[1]:
                self._data[apair[0]] = {
                    "choices": {
                        apair[1]: 1,
                    },
                    "normal": pair[0]
                }

            else:
                self._data[apair[0]] = {
                    "choices": {},
                    "normal": pair[0]
                }

    def get(self, keyword, sep=" ", first_only=True):
        self.read(self.filename)

        if self._data == {}:
            return 1

        keyword = alphanumeric_only(keyword.lower())

        if first_only:
            keyword = keyword.split(sep)[0]

        k = keyword.split(sep)[-1]

        iterated = unifix(keyword)
        ak = alphanumeric_only(k)

        if ak not in self._data:
            return 0

        a = self._data[ak]

        past = []

        while ak in self._data:
            if a["choices"] == {}:
                return formalize(u"{} {}".format(unifix(self._data[alphanumeric_only(keyword)]["normal"]), unifix(sep.join(past))))

            a = self._data[weighted_choice(a["choices"])]
            ak = alphanumeric_only(k)

            past.append(unifix(a["normal"]))

            if len(past) > 30:
                return formalize("{} {}".format(unifix(self._data[alphanumeric_only(keyword)]["normal"]), unifix(sep.join(past))))

    def random_markov(self, sep=" "):
        self.read(self.filename)

        k = self.keywords()

        if len(k) == 0:
            return 1

        c = random.choice(k)

        return self.get(c, sep)

    def read(self, filename):
        try:
            self._data = cson.load(open(filename))

        except BaseException:
            print "Error reading Markov chain from '{}', skipping:".format(filename)
            traceback.print_exc()

    def write(self, filename):
        open(filename, "w").write(cson.dumps(self._data))