4
0 Comments

I’m working with some of the corpus materials from NLTK.

The point of the exercise is to alter the represent_word function to improve the score. The function consumes a word as a string, and this word is either scrambled or unscrambled. The function produces a representation of the word, which is a list containing the following information:

  • Word length.
  • Number of vowels.
  • Number of consonants.
  • First and last letter of the word (these are always unscrambled).
  • A tuple of the most commonly used words from the corpus, whose characters are also members of the given word input.

I have also tried analysing anagrams of prefixes and suffixes, but they don’t contribute anything to the score in the shadow of the most common words with common characters tuple.

I’m not sure why I can’t improve the score. I’ve even tried increasing dictionary size by importing words from another corpus.

The only section that can be altered here is the represent_word function and the definitions just above it. However, I’m including the entire source in case it might yield some insight to someone.

import nltk
import re

def word_counts(corpus, wordcounts=None):
    """Count occurrences of each word (lower-cased) in *corpus*.

    Args:
        corpus: iterable of word strings.
        wordcounts: optional dict to accumulate counts into. A fresh dict
            is created when omitted — the original used a mutable default
            argument (``wordcounts={}``), which silently shares one dict
            across every call.

    Returns:
        dict mapping lower-cased word -> occurrence count.
    """
    if wordcounts is None:
        wordcounts = {}
    for word in corpus:
        key = word.lower()
        wordcounts[key] = wordcounts.get(key, 0) + 1
    return wordcounts

# Tokenise Persuasion: lower-case every token, keep only alphabetic ones.
JA_list = (token for token in
           (w.lower() for w in nltk.corpus.gutenberg.words('austen-persuasion.txt'))
           if token.isalpha())
JA_freqdist = nltk.FreqDist(JA_list)
# NOTE(review): the slice below is [:0], which always produces an EMPTY
# list — JA_topwords therefore stays empty and the Austen corpus never
# contributes to uniquewords. Presumably [:N] for some positive N was
# intended; confirm before relying on this.
JA_toplist = sorted(JA_freqdist.items(), key=lambda item: item[1], reverse=True)[:0]
JA_topwords = [word for word, _count in JA_toplist]

# Tokenise Pride and Prejudice from the plain-text file: lower-case every
# token, keep only alphabetic ones. The file is opened in a with-block so
# the handle is closed (the original leaked it via a bare open()).
with open("Pride and Prejudice.txt") as pp_file:
    PP_list = [tok.lower() for tok in pp_file.read().split() if tok.lower().isalpha()]
PP_freqdist = nltk.FreqDist(PP_list)
# The seven most frequent words in the novel, most frequent first.
PP_toplist = sorted(PP_freqdist.items(), key=lambda item: item[1], reverse=True)[:7]
PP_topwords = [word for word, _count in PP_toplist]

# Merge the two top-word lists: Austen-only words first (in their original
# order), followed by every Pride-and-Prejudice top word.
uniquewords = [word for word in JA_topwords if word not in PP_topwords]
uniquewords.extend(PP_topwords)

def represent_word(word):
    """Return a scramble-invariant representation of *word*.

    The representation is a tuple of: first letter, last letter, word
    length, consonant count, vowel count, and a tuple of known common
    words whose letters all occur in *word* (or None when there are
    none). Scrambling permutes only the interior letters, so every field
    is identical for a word and its scrambled form.

    Args:
        word: a lower-cased word, scrambled or not (assumed non-empty).

    Returns:
        tuple: (first, last, length, n_consonants, n_vowels, common_words).
    """
    def common_word(word):
        # Every dictionary entry whose letters are all present somewhere
        # in the (possibly scrambled) input word.
        findings = [entry for entry in uniquewords
                    if all(letter in word for letter in entry)]
        return tuple(findings) if findings else None

    vowels = "aeiouy"
    # Fixed: the original consonant list was "bcdfghjklmnpqrstvexz" —
    # it contained the vowel 'e' (so 'e' was counted as BOTH vowel and
    # consonant) and omitted 'w' (never counted at all), corrupting every
    # representation.
    consonants = "bcdfghjklmnpqrstvwxz"
    number_of_consonants = sum(word.count(c) for c in consonants)
    number_of_vowels = sum(word.count(c) for c in vowels)
    common_words = common_word(word)
    return (word[0], word[-1], len(word),
            number_of_consonants, number_of_vowels, common_words)

def create_mapping(words, mapping=None):
    """Map each word representation to the most frequent word having it.

    Args:
        words: mapping of word -> frequency (e.g. an nltk.FreqDist).
        mapping: optional dict to accumulate into. A fresh dict is created
            when omitted — the original used a mutable default argument
            (``mapping={}``), which silently shares one dict across calls.

    Returns:
        dict: representation -> (word, frequency), keeping for each
        representation the word with the highest frequency seen so far.
    """
    if mapping is None:
        mapping = {}
    for word in words:
        representation = represent_word(word)
        best = mapping.get(representation, ("", 0))
        if best[1] < words[word]:
            mapping[representation] = (word, words[word])
    return mapping

# Create a mapping of representations of the words in Persuasion by Jane
# Austen (the JA_freqdist built above) to use as the descrambling corpus.
words = JA_freqdist
mapping = create_mapping(words)

# Read the scrambled text and flatten it into a lower-cased word list,
# skipping blank lines.
with open("Pdrie and Puicejdre.txt") as scrambled_file:
    scrambled_lines = [line.split() for line in scrambled_file if line.strip()]
    scrambled_words = [token.lower() for tokens in scrambled_lines for token in tokens]

# Descramble each word via its representation; words whose representation
# is unknown to the mapping are passed through unchanged.
descrambled_words = []
for scrambled_word in scrambled_words:
    entry = mapping.get(represent_word(scrambled_word))
    descrambled_words.append(entry[0] if entry is not None else scrambled_word)

# Load the original (unscrambled) text the same way as the scrambled one.
with open("Pride and Prejudice.txt") as original_file:
    original_lines = [line.split() for line in original_file if line.strip()]
    original_words = [token.lower() for tokens in original_lines for token in tokens]

# Pair each descrambled guess with the true word and score the match rate.
word_pairs = zip(descrambled_words, original_words)
judgements = [guess == truth for guess, truth in word_pairs]
print("Correct: {0:.3%}".format(float(judgements.count(True)) / len(judgements)))
Asked question