#Felix Zhang, Period 5
from heapq import *

# Part-of-speech tags used throughout this file:
#   "nou" noun, "pn" pronoun, "ver" verb, "art" article, "adj" adjective,
#   ""    not tagged.

_CAPS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

# Pronouns and articles are few enough to be hardcoded. They are stored as
# sets of whole words so that membership tests do not match substrings
# (e.g. "da" must not count as an article just because "das" contains it).
_PRONOUNS = frozenset(
    "ich du er sie es ihn ihm ihr uns euch wir ihr ihnen".split())
_ARTICLES = frozenset("der die das den dem".split())
# TODO: prepositions are not coded in yet:
#   "durch fuer gegen ohne um bei mit nach seit von zu" -> tag "prep"

# Possible [case, gender/number] pairs for each article - "dem" can be
# dative and masculine OR dative and neuter.
_ARTICLE_PAIRS = {
    "dem": [["dat", "mas"], ["dat", "neu"]],
    "der": [["nom", "mas"], ["dat", "fem"]],
    "die": [["nom", "fem"], ["akk", "fem"], ["nom", "pl"], ["akk", "pl"]],
    "das": [["nom", "neu"], ["akk", "neu"]],
    "den": [["akk", "mas"], ["dat", "pl"]],
}

# Possible [case, gender/number] pairs based on adjective endings, similar
# to the article table above.
_ADJ_EN_PAIRS = [["akk", "mas"], ["nom", "pl"], ["akk", "pl"], ["dat", "pl"],
                 ["dat", "mas"], ["dat", "fem"], ["dat", "neu"]]
_ADJ_E_PAIRS = [["akk", "fem"], ["akk", "neu"], ["nom", "mas"],
                ["nom", "fem"], ["nom", "neu"]]


def main():
    """Prompt for a German word and run the dictionary pipeline on it."""
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    word = read_line(
        "Enter German word (sentence functionality coming eventually): ")
    dictionarymeth(word)


def _read_lines(filename):
    """Return the lines of *filename* without their newline characters.

    Only a trailing empty string (produced by a final newline) is dropped,
    so a file that does not end in a newline keeps its last line.
    """
    with open(filename) as handle:
        lines = handle.read().split('\n')
    if lines and lines[-1] == '':
        lines.pop()
    return lines


#Corpus methods - I originally wanted to do some statistical methods at the
#beginning of the year, but hit a dead end because there wasn't a perfect
#word-for-word alignment of German and English in my corpus
def corpusmeth(input):
    """Print co-occurrence counts of English words for the German *input*.

    Reads the two line-aligned sample corpus files, collects the English
    sentences whose German counterpart contains *input*, and prints a dict
    mapping *input* to a list of [english_word, count] entries.
    """
    corpusdict = {}
    de = readcorpus("de-news-sample.de.al")
    en = readcorpus("de-news-sample.en.al")
    matches = findincorpus(de, en, input)
    adddef(en, de, corpusdict, input, matches)
    print(corpusdict)


def findincorpus(decorpus, encorpus, input):
    """Return the English sentences aligned with German ones containing *input*.

    *decorpus* and *encorpus* are parallel lists of sentences; sentence i of
    one corresponds to sentence i of the other. The match is a plain
    substring test on the German sentence.
    """
    # Pair by position rather than list.index(), which returns the first
    # equal sentence and so mis-aligns repeated sentences.
    return [encorpus[i] for i, sentence in enumerate(decorpus)
            if input in sentence]


def readcorpus(filename):
    """Return the corpus file *filename* as a list of lines."""
    return _read_lines(filename)


def adddef(encorpus, decorpus, corpusdict, deword, sentences):
    """Count every distinct English word of each sentence against *deword*.

    *encorpus* and *decorpus* are accepted for interface compatibility; the
    counting itself only needs the already-selected English *sentences*.
    """
    for sentence in sentences:
        for enword in removeduplicates(sentence.split(' ')):
            addprob(corpusdict, deword, enword)


def removeduplicates(sentence):
    """Return the items of *sentence* with duplicates removed, order kept."""
    seen = []
    for item in sentence:
        if item not in seen:
            seen.append(item)
    return seen


def addprob(corpusdict, deword, enword):
    """Increment the count of *enword* under corpusdict[deword].

    corpusdict maps a German word to a list of [english_word, count]
    entries; a missing key or entry is created with count 1.
    """
    entries = corpusdict.setdefault(deword, [])
    for entry in entries:
        if entry[0] == enword:
            entry[1] += 1
            return
    entries.append([enword, 1])


#Dictionary methods - What I'm focusing on
def dictionarymeth(input):
    """Load 'dict.txt' and run the translation pipeline on *input*."""
    d = {}
    readdict(d, 'dict.txt')
    return translate(d, input)


def readdict(hashtable, filename):
    """Read a dictionary file and store its entries in *hashtable*.

    Each line is split on single spaces: the first field is the German key
    and the remaining fields (as a list) are registered as one value.
    """
    for line in _read_lines(filename):
        fields = line.split(' ')
        register(hashtable, fields[0], fields[1:])


def register(hashtable, key, value):
    """Append *value* to the list stored under *key*, creating it if needed."""
    if key in hashtable:
        hashtable[key].append(value)
    else:
        hashtable[key] = [value]


def translate(hashtable, input):
    """Run all the components on the input.

    Currently tags the words, analyses the first noun or verb found and
    passes that word to the lemmatizer stub. The dictionary lookup itself
    is not implemented yet, so nothing is returned.
    """
    words = input.split(' ')
    tagged = pospeech(words)
    attribs = properties(tagged, hashtable)
    if attribs is not None:
        # attribs[0] is the analysed word (see properties()).
        lemmatizer(attribs[0])
    # TODO: look the result up in hashtable and return the translation.
    return None


def pospeech(words):
    """Tag part of speech based on sentence position and capitalization.

    Returns a list of [word, tag] lists, one per input word, in order.
    """
    wordtags = []
    for i, word in enumerate(words):
        lowered = word.lower()
        tag = ""
        if word and word[0] in _CAPS and i != 0:
            #If a word in German is capitalized, it is a noun (the first
            #word of the sentence is capitalized anyway, so it is skipped)
            tag = "nou"
        elif lowered in _PRONOUNS:
            tag = "pn"
        elif i > 0 and wordtags[i - 1][1] in ("nou", "pn"):
            #A verb (at least in this program) always follows a noun or
            #pronoun
            tag = "ver"
        elif lowered in _ARTICLES:
            tag = "art"
        wordtags.append([word, tag])

    last = len(wordtags) - 1
    for place, entry in enumerate(wordtags):
        if entry[1] != "":
            continue
        #If a word follows a verb or precedes a noun and wasn't already
        #tagged, it's an adjective
        before_noun = place < last and wordtags[place + 1][1] == "nou"
        after_verb = place > 0 and wordtags[place - 1][1] == "ver"
        if before_noun or after_verb:
            entry[1] = "adj"
    return wordtags


def _noun_pairs(words, idx):
    """Return the plausible [case, gender/number] pairs for the noun at idx."""
    modifier = []
    article = []
    if idx != 0:
        # Walk backwards until an article or the start of the sentence.
        j = idx - 1
        while words[j][1] != "art" and j != 0:
            if words[j][1] == "adj":
                # Each adjective passed overwrites the previous one, so the
                # one kept is the adjective nearest the article.
                modifier = words[j]
            j -= 1
        article = words[j]

    posartpairs = []
    if article:
        posartpairs = [list(pair) for pair in
                       _ARTICLE_PAIRS.get(article[0].lower(), [])]
    if not modifier:
        return posartpairs

    #Generate pairs from the adjective ending and keep those that the
    #article also allows
    adjective = modifier[0]
    posmodpairs = []
    if adjective[-2:] == "en":
        posmodpairs = _ADJ_EN_PAIRS
    elif adjective[-1:] == "e":
        posmodpairs = _ADJ_E_PAIRS
    return [list(pair) for pair in posmodpairs if pair in posartpairs]


def _verb_properties(verb):
    """Return (verb, [person, number] pairs, tense) guessed from affixes."""
    #something to try later: find the two nearest nouns and pronouns from
    #right and left, check which is nominative, match up the number with
    #the verb ending
    #The "ge" prefix in German indicates a verb in the past perfect tense
    tense = "pperfect" if verb[0:2] == "ge" else "pres"
    end = verb[-2:]
    print(end)  # debug output kept from the original script
    pairs = []
    #More pairs based on endings - if a verb ends with "en", it can either
    #be first person or third person plural
    if end == "en":
        pairs = [["1", "pl"], ["3", "pl"]]
    elif end == "st":
        pairs = [["2", "sing"]]
    last = verb[-1:]
    if last == "t":
        pairs.append(["3", "sing"])
        pairs.append(["2", "pl"])
    elif last == "e":
        pairs = [["1", "sing"]]
    return verb, pairs, tense


def properties(words, d):
    """Find linguistic properties like case and plurals and past tense.

    *words* is a list of [word, tag] lists as produced by pospeech(). Only
    the first noun or verb encountered is analysed:

    * noun -> (word, pairs) with pairs a list of [case, gender/number]
    * verb -> (word, pairs, tense) with pairs a list of [person, number]

    Returns None when *words* contains neither. *d* (the dictionary) is
    not used yet.
    TODO: build a pronoun table from the entries of d tagged "pn".
    """
    for idx, entry in enumerate(words):
        tag = entry[1]
        if tag == "nou":
            return entry[0], _noun_pairs(words, idx)
        if tag == "ver":
            return _verb_properties(entry[0])
    return None


def lemmatizer(word):
    """Placeholder lemmatizer: returns *word* unchanged.

    TODO: reduce *word* to its dictionary form.
    """
    return word


if __name__ == "__main__":
    main()

#Things to do:
#POS TAGGING - Statistical method?
#LEMMATIZE