Rcjp's Weblog

October 21, 2006

Autogenerated Poetry

Filed under: python — rcjp @ 7:23 pm

#
# Sucks in a story like Milton's Paradise Lost and generates text
# using the word probability from the sample document
# translated from Common Lisp Graham p.140
#
import re
import random
import textwrap

# Maps each word (or punctuation mark) to a dictionary of the words seen
# directly after it and how many times each one followed.
words = dict()

def read_text(filename):
    """Fill the global 'words' dictionary from the text in 'filename'.

    The text is lower-cased and split on whitespace; each chunk is then
    split again so that every non-word character (punctuation) becomes
    a token of its own.  For every token, 'words' maps it to a
    dictionary of the tokens seen directly after it and how many times
    each one followed.  The first token of the file is recorded as a
    follower of '.', i.e. as the start of a sentence.

    Entries are added to whatever 'words' already holds, and the number
    of keys in 'words' is printed once the file has been processed.
    """

    # 'with' closes the file as soon as it has been read instead of
    # leaving the handle open until garbage collection.
    with open(filename, 'r') as textfile:
        sampletext = textfile.read()

    previous = '.'  #  i.e. the start of a sentence
    for wordpunc in sampletext.lower().split():
        # keep the punctuation as part of the word e.g. 'bye.'
        # now split 'bye.' as separate words 'bye' and '.'  note (\W)
        # returns the splitting characters as well as the fields so we
        # get 'bye' '.' '' and we want to ignore the last separator

        # [Note normally we'd want to avoid punctuation so should do
        #  words=re.compile(r'[\w'-]+') then
        #  for word in words.finditer(line)
        #  do something to word.group(0) ]
        for word in re.split(r'(\W)', wordpunc):
            if word == '':
                continue
            # one lookup creates the follower table on first sight of
            # 'previous' and returns the existing one afterwards
            followers = words.setdefault(previous, {})
            followers[word] = followers.get(word, 0) + 1
            previous = word
    # single-argument form prints the same line on Python 2 and 3
    print('Using a vocabulary of %d words' % len(words))

def format_word(word, previous):
    """Return 'word' prepared for appending to the generated text.

    A word that follows '.' is capitalized and prefixed with a space.
    Otherwise a purely alphabetic word is prefixed with a space, and
    anything else (e.g. punctuation) is returned unchanged so that it
    attaches directly to the preceding word.
    """
    if previous == '.':
        return ' ' + word.capitalize()
    if word.isalpha():
        return ' ' + word
    return word


def generate_text(nwords, previous = '.'):
    """Return up to 'nwords' random words as one string.

    Starting from 'previous', each next word is drawn from the
    followers recorded for the current word in the global 'words'
    dictionary, with a chance proportional to how often it followed.
    Every chosen word is passed through format_word before being added
    to the result.

    If the current word has no recorded followers (for instance it was
    only ever the last token of the sample text) generation restarts
    from '.', the sentence start.  If '.' has no followers either, the
    text built so far is returned, which may hold fewer than 'nwords'
    words.
    """

    pieces = []
    for _ in range(nwords):
        nextwords = words.get(previous)
        if not nextwords:
            # dead end: fall back to the start-of-sentence marker
            previous = '.'
            nextwords = words.get(previous)
            if not nextwords:
                break
        # randint includes both end points, so draw from 1..total;
        # starting at 0 would give the first follower an extra chance
        pick = random.randint(1, sum(nextwords.values()))
        count = 0
        for word, freq in nextwords.items():
            count += freq
            if count >= pick:
                break
        pieces.append(format_word(word, previous))
        previous = word
    return ''.join(pieces)

if __name__ == '__main__':
    # Build the follower table from the sample file, then print 100
    # generated words wrapped to 50 columns.
    read_text('c:/tmp/testwords')
    generated = generate_text(100)
    print(textwrap.fill(generated, 50))

"""
e.g.
In [210]: read_text('c:/tmp/ParadiseLost.txt')
Using a vocabulary of 8993 words

In [211]: print textwrap.fill(generate_text(100),50)
 Farewell, immutable; i give not: conviction to
torment me, and worse. Satan only disagree of
servants feet. To that i boast what was old night,
metals of belial came; lest the government well
thou for which we seek some glade, or who wrong,
but he more soft and evil ruin. So erroneous there
dwell permits, and inward faculties, and bliss),
or, to know. With design to spend, till then to
the fields, when he ended; of nature him hither
thrust
"""

Advertisements

Leave a Comment »

No comments yet.

RSS feed for comments on this post.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

Create a free website or blog at WordPress.com.

%d bloggers like this: