#Felix Zhang, Period 5
#Statistical part of speech tagging - This program tags words based on their
#most frequently occurring tag in the corpus.

# Lines whose first character is one of these are skipped as "noise"
# (markers, comments and entries starting with a digit).
_NOISE_PREFIXES = "#%1234567890"


def main():
    """Load the corpus, prompt for a word and print its most frequent tag."""
    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    de = readcorpus("tiger_release_aug07.export")
    word = read_line("Enter word: ")
    findsingleword(word, de)


def findmaxprob(list):
    """Return the tag with the highest count from a list of [tag, count] pairs.

    The parameter name shadows the builtin but is kept so existing callers
    are unaffected. Entries with fewer than two elements are ignored. On a
    tie the first pair encountered wins. Returns "" when no pair has a
    count greater than zero.
    """
    best_count = 0
    best_tag = ""
    for entry in list:
        if len(entry) < 2:
            # Tolerate placeholder entries such as [].
            continue
        if entry[1] > best_count:
            best_count = entry[1]
            best_tag = entry[0]
    return best_tag


def _tagfields(line):
    """Split one corpus line on tabs and return its non-blank fields.

    Returns None for lines that should be skipped: empty lines, lines that
    start with a tab, lines whose first character is '#', '%' or a digit,
    and lines with fewer than three non-blank fields.
    """
    fields = line.split('\t')
    first = fields[0]
    if not first or first[0] in _NOISE_PREFIXES:
        return None
    noblanks = removeblanks(fields)
    if len(noblanks) < 3:
        return None
    return noblanks


def findsingleword(word, de):
    """Count the tags seen for a single word and print the most likely one.

    Works like the lookup table built by findpos, but only collects data
    for one word. A line matches when `word` equals its first or second
    non-blank field; the third non-blank field is taken as the tag.

    Prints the word with its [tag, count] pairs, then the most frequent tag
    (an empty tag when the word does not occur in `de`).
    """
    counts = {}
    for line in de:
        fields = _tagfields(line)
        if fields is None:
            continue
        if word == fields[1] or word == fields[0]:
            # Each matching line is counted exactly once.
            register(counts, word, fields[2])
    pos = counts.get(word, [])
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("%s %s" % (word, pos))
    print("Most likely tag: " + findmaxprob(pos))


def readcorpus(filename):
    """Read `filename` and return its lines without trailing newlines.

    The file is closed once read. A final line without a terminating
    newline is kept; the empty string produced by a trailing newline is
    dropped.
    """
    with open(filename) as corpus:
        lines = corpus.read().split('\n')
    if lines and lines[-1] == "":
        lines.pop()
    return lines


def findpos(readin):
    """Build a lookup table mapping keys to lists of [tag, count] pairs.

    For every usable line in `readin` the second non-blank field is the
    key and the third non-blank field is the tag. Noise and malformed
    lines are skipped.
    """
    lookup = {}
    for line in readin:
        fields = _tagfields(line)
        if fields is not None:
            register(lookup, fields[1], fields[2])
    return lookup


def removeblanks(list):
    """Return a new list containing the non-empty strings of `list`.

    Splitting a line on tabs yields empty entries wherever tabs are
    adjacent; this drops them. The parameter name is kept for
    compatibility with existing callers.
    """
    return [field for field in list if field != ""]


def register(table, key, value):
    """Record one occurrence of `value` for `key` in `table`.

    `table` maps each key to a list of [value, count] pairs. The count of
    a matching pair is incremented in place; otherwise a new [value, 1]
    pair is appended. `table` is mutated and nothing is returned.
    """
    pairs = table.setdefault(key, [])
    matched = False
    for pair in pairs:
        if pair[0] == value:
            pair[1] += 1
            matched = True
    if not matched:
        pairs.append([value, 1])


# Only run the interactive tagger when executed as a script, not on import.
if __name__ == "__main__":
    main()