#Felix Zhang, Period 5
"""Rule-based German-to-English translator for short, simple sentences.

Pipeline (see ``translate``): part-of-speech tagging -> morphological
analysis -> noun/verb agreement -> lemmatisation -> dictionary lookup ->
noun-phrase chunking -> element assignment -> priority-based reordering.

Data shapes used throughout the module:

* tagged word: ``[word, tag]`` with tag one of "nou", "ver", "adj", "art",
  "pn" or "" (untagged).
* analysis entry: ``[tagged word, readings]``.  Noun and adjective readings
  are ``[case, gender-or-"pl"]`` pairs.  Verb readings are
  ``[person, number]`` pairs followed by the tense string ("pres" or
  "pperfect") as the LAST element of the list.
"""
import sys
from heapq import *


def main():
    """Translate the built-in sample sentence using the dictionary 'dict.txt'."""
    # To translate user input instead, read the sentence with:
    #   raw_input("Enter German word, phrase, or sentence: ")
    sentence = "den Mann machen die kleinen Kinder"
    dictionarymeth(sentence)


def dictionarymeth(input):
    """Load 'dict.txt' and run the translation pipeline on ``input``.

    Returns whatever ``translate`` returns (the reordered chunk list).
    """
    d = {}
    readdict(d, 'dict.txt')
    return translate(d, input)


def readdict(hashtable, filename):
    """Read a dictionary file and store its entries in ``hashtable``.

    Each non-blank line is split on single spaces: the first token is the
    German key, the remaining tokens form one English entry (a list of
    words).  Entries are added through ``register``.
    """
    # ``with`` closes the file; ``splitlines`` keeps the final entry even when
    # the file does not end with a newline.
    with open(filename) as handle:
        lines = handle.read().splitlines()
    for line in lines:
        if not line.strip():
            continue
        lingset = line.split(' ')
        register(hashtable, lingset[0], lingset[1:])


def register(hashtable, key, value):
    """Append ``value`` to the list stored under ``key``, creating it if needed."""
    if key in hashtable:
        hashtable[key].append(value)
    else:
        hashtable[key] = [value]


def translate(hashtable, input):
    """Run every pipeline stage on ``input`` and print each intermediate result.

    ``hashtable`` maps German words to lists of English entries (see
    ``readdict``).  Returns the chunk list sorted by priority, i.e. the
    sentence rearranged to English structure.
    """
    # split() without an argument ignores repeated/leading/trailing blanks,
    # so no empty "words" reach the tagger.
    words = input.split()
    tagged = pospeech(words)
    print("Part of speech tags:  %s" % (tagged,))
    attribs = properties(tagged, hashtable)
    print("Morphological analysis:  %s" % (attribs,))
    agree = nvagree(attribs)
    print("Disambiguated after noun-verb agreement:  %s" % (agree,))
    roots = lemmatize(agree)
    print("Lemmatized:  %s" % (roots,))
    translatedroots = lookup(roots, words, hashtable)
    print("Root translated:  %s" % (translatedroots,))
    sys.stdout.write("NP Chunked English:  ")
    chunkedtranslation = NPchunk(translatedroots, tagged, agree)
    print(str(chunkedtranslation))
    sys.stdout.write("Inflected (only works before chunking):  ")
    inflect(words, translatedroots, agree)
    print("")
    print("Assigned an element type: ")
    elemented = elementassign(chunkedtranslation)
    print(str(elemented))
    print("Assigned priority: ")
    priority = priorityassign(elemented)
    print(str(priority))
    print("Rearranged to English structure: ")
    # Sort on the priority value only (index 0 of each chunk).  Comparing
    # whole chunks would compare str with list on ties, which raises on
    # Python 3; the stable sort keeps tied chunks in sentence order.
    rearranged = sorted(priority, key=lambda chunk: chunk[0])
    print(str(rearranged))
    return rearranged


def pospeech(words):
    """Tag each word's part of speech from position and capitalisation.

    Returns a list of ``[word, tag]`` pairs.  Rules, in order: a capitalised
    word that is not first in the sentence is a noun; a known pronoun is
    "pn"; a word directly after a noun or pronoun is a verb; a known article
    is "art".  In a second pass an untagged word that precedes a noun or
    follows a verb becomes "adj".
    """
    caps = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # Pronouns and articles are few enough to be hardcoded.  They are held as
    # sets of whole words: a substring test on the joined string would also
    # match fragments such as "e" or "hr".
    pronouns = set("ich du er sie es ihn ihm ihr uns euch wir ihr ihnen".split())
    articles = set("der die das den dem".split())
    # Prepositions (durch fuer gegen ohne um bei mit nach seit von zu) are
    # not handled yet.
    wordtags = []
    for position, word in enumerate(words):
        lowered = word.lower()
        previous_tag = wordtags[position - 1][1] if position > 0 else ""
        tag = ""
        if position != 0 and word and word[0] in caps:
            # A capitalised German word (other than the first) is a noun.
            tag = "nou"
        elif lowered in pronouns:
            tag = "pn"
        elif previous_tag in ("nou", "pn"):
            # A verb (at least in this program) always follows a noun or
            # pronoun - except past tense, which sometimes works out.
            tag = "ver"
        elif lowered in articles:
            tag = "art"
        wordtags.append([word, tag])

    last = len(wordtags) - 1
    for position, entry in enumerate(wordtags):
        if entry[1] != "":
            continue
        before_noun = position < last and wordtags[position + 1][1] == "nou"
        after_verb = position > 0 and wordtags[position - 1][1] == "ver"
        if before_noun or after_verb:
            entry[1] = "adj"
    return wordtags


def _find_article_and_modifier(words, position):
    """Walk left from the noun at ``position``; return ``(article, modifier)``.

    ``article`` is the first article met, ``modifier`` the closest preceding
    adjective; either is ``[]`` when absent.  The walk stops at an article or
    at an earlier noun so that another noun phrase is never raided.
    """
    article = []
    modifier = []
    for earlier in reversed(words[:position]):
        tag = earlier[1]
        if tag == "art":
            article = earlier
            break
        if tag == "nou":
            break
        if tag == "adj" and not modifier:
            modifier = earlier
    return article, modifier


def _adjective_readings(adjective):
    """Return the case/gender pairs compatible with an adjective's ending."""
    if adjective.endswith("en"):
        return [["akk", "mas"], ["nom", "pl"], ["akk", "pl"], ["dat", "pl"],
                ["dat", "mas"], ["dat", "fem"], ["dat", "neu"]]
    if adjective.endswith("e"):
        return [["akk", "fem"], ["akk", "neu"], ["nom", "mas"],
                ["nom", "fem"], ["nom", "neu"]]
    return []


def _verb_readings(verb):
    """Return the person/number pairs for a verb form, followed by its tense."""
    # The "ge" prefix is taken to mark the past perfect; infinitives that
    # merely start with "ge" (e.g. "gehen") are therefore misclassified.
    tense = "pperfect" if verb[0:2] == "ge" else "pres"
    pairs = []
    if verb.endswith("en"):
        # "-en" can be first or third person plural.
        pairs = [["1", "pl"], ["3", "pl"]]
    elif verb.endswith("st"):
        pairs = [["2", "sing"]]
    if verb.endswith("t"):
        pairs.append(["3", "sing"])
        pairs.append(["2", "pl"])
    elif verb.endswith("e"):
        pairs = [["1", "sing"]]
    pairs.append(tense)
    return pairs


def properties(words, d):
    """Find linguistic properties (case, gender/plural, person, tense).

    ``words`` is the tagged list from ``pospeech``; ``d`` is accepted for
    interface compatibility and is not used.  Returns analysis entries for
    every noun (preceded by its modifying adjective, if any) and every verb;
    articles and other words are not included.
    """
    # Possible case/gender pairs for each definite article - "dem" can be
    # dative masculine OR dative neuter, and so on.
    article_readings = {
        "dem": [["dat", "mas"], ["dat", "neu"]],
        "der": [["nom", "mas"], ["dat", "fem"]],
        "die": [["nom", "fem"], ["akk", "fem"], ["nom", "pl"], ["akk", "pl"]],
        "das": [["nom", "neu"], ["akk", "neu"]],
        "den": [["akk", "mas"], ["dat", "pl"]],
    }
    sentence = []
    for position, tagged in enumerate(words):
        pos = tagged[1]
        if pos == "nou":
            article, modifier = _find_article_and_modifier(words, position)
            posartpairs = []
            if article:
                posartpairs.extend(article_readings.get(article[0].lower(), []))
            if modifier:
                # Keep only readings allowed by both adjective ending and article.
                readings = [pair for pair in _adjective_readings(modifier[0])
                            if pair in posartpairs]
                # The adjective and its noun deliberately share ONE readings
                # list object, so in-place pruning of one affects the other.
                sentence.append([modifier, readings])
            else:
                readings = posartpairs
            sentence.append([tagged, readings])
        if pos == "ver":
            sentence.append([tagged, _verb_readings(tagged[0])])
    return sentence


def _add_unique(items, value):
    """Append ``value`` to ``items`` unless it is already present."""
    if value not in items:
        items.append(value)


def lemmatize(words):
    """Reduce words to candidate root forms.

    ``words`` is a list of analysis entries.  Returns ``[word, roots]`` pairs
    where ``roots`` lists the possible dictionary forms: infinitives for
    verbs, uninflected stems for adjectives, singulars for nouns.
    """
    # Still to implement: vowel changes (e->i, a->a-umlaut, e->ie).
    verb_endings = {
        ("3", "pl"): "en", ("1", "pl"): "en",
        ("2", "sing"): "st",
        ("3", "sing"): "t", ("2", "pl"): "t",
        ("1", "sing"): "e",
    }
    withroots = []
    for entry in words:
        word = entry[0][0]
        pos = entry[0][1]
        readings = entry[1]
        rootposs = []
        if pos == "ver":
            if readings[-1] == "pperfect":
                stem = word[word.rfind("ge") + 2:word.rfind("t")]
                _add_unique(rootposs, stem + "en")
            else:
                # One candidate infinitive per remaining person/number reading.
                for reading in readings[:-1]:
                    ending = verb_endings.get(tuple(reading), "")
                    _add_unique(rootposs, word[0:word.rfind(ending)] + "en")
        elif pos == "adj":
            for reading in readings:
                if reading[0] == "dat" or reading == ["akk", "mas"] or reading[1] == "pl":
                    # Plural / dative / accusative-masculine ending.
                    root = word[0:word.rfind("en")]
                else:
                    # Singular ending.  ("-es" after indefinite articles such
                    # as "ein" is not handled.)
                    root = word[0:word.rfind("e")]
                _add_unique(rootposs, root)
        elif pos == "nou":
            for reading in readings:
                if reading[1] != "pl":
                    # A non-plural noun carries no suffix to strip.
                    _add_unique(rootposs, word)
                else:
                    # Try every possible German plural ending.  endswith()
                    # avoids false hits on words shorter than the suffix.
                    for suffix in ("n", "en", "er", "s", "ern"):
                        if word.endswith(suffix):
                            _add_unique(rootposs, word[:-len(suffix)])
        withroots.append([word, rootposs])
    return withroots


def lookup(roots, words, hashtable):
    """Translate each word of the sentence through its candidate roots.

    Returns ``[german word, english word]`` pairs.  Words without an entry in
    ``roots`` (e.g. articles) are looked up in ``hashtable`` directly.  Only
    the first word of the first dictionary entry is used as the translation.
    """
    definitions = []
    for word in words:
        inroots = False
        for entry in roots:
            if entry[0] != word:
                continue
            inroots = True
            possdef = []
            for root in entry[1]:
                if root in hashtable:
                    _add_unique(possdef, hashtable[root][0][0])
            if len(possdef) == 0:
                print("No definitions found for " + word + ".")
            elif len(possdef) == 1:
                definitions.append([word, possdef[0]])
            # NOTE(review): a word with several distinct candidate
            # translations is dropped here without a message, which leaves
            # the result shorter than the sentence - confirm intent.
        if not inroots and word in hashtable:
            definitions.append([word, hashtable[word][0][0]])
    return definitions


def findinattribs(attribs, word):
    """Return the readings stored for tagged ``word`` in ``attribs``, or None."""
    for entry in attribs:
        if entry[0] == word:
            return entry[1]
    return None


def NPchunk(translated, tagged, attribs):
    """Group translated words into noun-phrase chunks.

    A noun chunk is a list holding the articles/adjectives directly before a
    noun (as ``[english, tag]``) followed by the noun itself (as
    ``[english, tag, readings]``).  Verbs, pronouns and untagged words are
    emitted as flat ``[english, tag, readings]`` items.  Articles and
    adjectives only appear inside noun chunks.

    ``translated`` is indexed in parallel with ``tagged``, so it must hold
    one entry per word of the sentence.
    """
    chunks = []
    for index, word in enumerate(tagged):
        english = translated[index]
        pos = word[1]
        enpair = [english[1], pos, findinattribs(attribs, word)]
        if pos == "nou":
            chunk = [enpair]
            temp = index - 1
            # Bounds check first, then collect every directly preceding
            # article or adjective.
            while temp >= 0 and tagged[temp][1] in ("adj", "art"):
                chunk.append([translated[temp][1], tagged[temp][1]])
                temp = temp - 1
            chunk.reverse()
            chunks.append(chunk)
        elif pos != "art" and pos != "adj":
            chunks.append(enpair)
    return chunks


def _grammatical_number(reading):
    """Map a noun reading's gender-or-"pl" slot to "sing" / "pl"."""
    return "pl" if reading[1] == "pl" else "sing"


def nvagree(attribs):
    """Use noun-verb agreement to reduce ambiguity; returns ``attribs``.

    The last verb in ``attribs`` is compared against the closest
    nominative-capable nouns before and after it (``findclosestsubjects``).
    With exactly one candidate it becomes the subject outright.  Otherwise
    the first candidate with a nominative reading whose number agrees with a
    third-person reading of the verb is chosen, and the nominative reading is
    removed from every other noun.  ``attribs`` is modified in place.
    """
    verb = None
    verbindex = 0
    for position, entry in enumerate(attribs):
        if entry[0][1] == "ver":
            verb = entry
            verbindex = position
    if verb is None:
        # Nothing to agree with; leave the analysis untouched.
        return attribs
    tense = verb[1][-1]
    closest = findclosestsubjects(attribs, verbindex)

    if len(closest) == 1:
        subject = closest[0]
        first_reading = subject[1][0]
        attribs[findinlist(attribs, subject[0])][1] = [["nom", first_reading[1]]]
        person = "3" if subject[0][1] == "nou" else ""
        # The verb gets a grammatical number ("sing"/"pl"), never the noun's gender.
        attribs[verbindex][1] = [[person, _grammatical_number(first_reading)], tense]
        return attribs

    verb_readings = verb[1][:-1]
    for candidate in closest:
        person = "3" if candidate[0][1] == "nou" else ""
        for reading in candidate[1]:
            agreement = [person, _grammatical_number(reading)]
            if agreement in verb_readings:
                attribs[findinlist(attribs, candidate[0])][1] = [reading]
                attribs[verbindex][1] = [agreement, tense]
                return eliminateother(attribs, candidate, closest)
    return attribs


def eliminateother(attribs, sub, closest):
    """Remove the nominative reading from every noun other than ``sub``.

    ``sub`` is the chosen subject as ``[tagged word, readings]``; nouns are
    matched on the tagged word only.  ``closest`` is accepted for interface
    compatibility and is not used.  Returns ``attribs`` (modified in place).
    """
    for entry in attribs:
        if entry[0][1] == "nou" and entry[0] != sub[0]:
            # Slice assignment prunes in place (keeping any list shared with
            # an adjective in sync) without mutating during iteration.
            entry[1][:] = [reading for reading in entry[1] if reading[0] != "nom"]
    return attribs


def findinlist(list, item):
    """Return the index of the first entry whose first element equals ``item``.

    Returns None when no entry matches.
    """
    for position, entry in enumerate(list):
        if entry[0] == item:
            return position
    return None


def _nominative_readings(entry):
    """Return the "nom" readings of a noun entry ([] for non-nouns)."""
    if entry[0][1] != "nou":
        return []
    return [reading for reading in entry[1] if reading[0] == "nom"]


def findclosestsubjects(attribs, verbindex):
    """Find the nearest possible subjects around the verb at ``verbindex``.

    Scans left, then right, and in each direction keeps the first noun that
    has at least one nominative reading.  Returns a list of zero to two
    ``[tagged word, nominative readings]`` items (left hit first).
    """
    possnouns = []
    for position in range(verbindex - 1, -1, -1):
        nominative = _nominative_readings(attribs[position])
        if nominative:
            possnouns.append([attribs[position][0], nominative])
            break
    # range() stops at the last valid index, so a sentence without a
    # candidate after the verb no longer runs off the end of the list.
    for position in range(verbindex + 1, len(attribs)):
        nominative = _nominative_readings(attribs[position])
        if nominative:
            possnouns.append([attribs[position][0], nominative])
            break
    return possnouns


def elementassign(chunked):
    """Append an element type to every chunk (in place) and return them.

    Element types: "mverb" (present-tense verb), "auxverb" (any other verb;
    auxiliaries are not really supported yet), "dobj" / "iobj" / "sub" for
    noun chunks whose first remaining reading is accusative / dative /
    nominative, and "" when nothing applies.  When a noun is still ambiguous
    only its first reading is considered.
    """
    case_elements = {"akk": "dobj", "dat": "iobj", "nom": "sub"}
    newchunks = []
    for chunk in chunked:
        element = ""  # reset per chunk so no label leaks from the previous one
        if isinstance(chunk[0], list):
            # Noun chunk: the noun is the last member.
            head = chunk[-1]
            if head[1] == "nou" and len(head) > 2 and head[2]:
                element = case_elements.get(head[2][0][0], "")
        elif chunk[1] == "ver":
            # The tense is the last item of a verb's readings.
            element = "mverb" if chunk[2][-1] == "pres" else "auxverb"
        chunk.append(element)
        newchunks.append(chunk)
    return newchunks


def priorityassign(elemented):
    """Prefix every chunk with its priority from 'priorities.txt'.

    Each non-blank line of the file is ``element priority``.  The priority of
    the chunk's element type (its last item) is inserted at index 0.  A chunk
    whose element type has no entry receives the priority of the chunk before
    it ("" for the first chunk).  Chunks are modified in place and returned.
    """
    with open("priorities.txt") as handle:
        assignments = [line.split(" ")
                       for line in handle.read().splitlines() if line.strip()]
    priority = ""
    for chunk in elemented:
        element = chunk[-1]
        for entry in assignments:
            if entry[0] == element and len(entry) > 1:
                priority = entry[1]
        chunk.insert(0, priority)
    return elemented


def inflect(words, translated, attribs):
    """Add simple English inflections and print the result on one line.

    Pretty simplistic: appends -s / -es to third-person-singular verbs and -s
    to plural nouns.  Returns ``[english, reading, inflected]`` triples for
    nouns and verbs and ``[english, english]`` pairs for everything else.
    ``words`` is accepted for interface compatibility and is not used.
    """
    print("")
    conjugated = []
    for pair in translated:
        word = pair[0]
        english = pair[1]
        needsconjug = False
        for entry in attribs:
            tagged = entry[0]
            readings = entry[1]
            if tagged[0] != word or tagged[1] == "adj":
                continue
            needsconjug = True
            if tagged[1] == "ver":
                for reading in readings[:-1]:
                    ending = ''
                    if reading == ["3", "sing"]:
                        ending = 'es' if english.endswith(("s", "x")) else 's'
                    conjugated.append([english, reading, english + ending])
            elif tagged[1] == "nou":
                for reading in readings:
                    ending = 's' if reading[1] == "pl" else ''
                    conjugated.append([english, reading, english + ending])
        if not needsconjug:
            conjugated.append([english, english])
    # No trailing newline: the caller ends the line.
    sys.stdout.write(" ".join(str(item) for item in conjugated))
    return conjugated


if __name__ == "__main__":
    main()

#Things to do:
#lemmatize - strong verbs, vowel changes, imperfect
#Grammar