#Felix Zhang, Period 5
from heapq import *

def main():
	"""Ask the user for German text and hand it to the dictionary pipeline."""
	german_text = raw_input("Enter German word (sentence functionality coming eventually): ")
	dictionarymeth(german_text)
#Corpus methods - I originally wanted to do some statistical methods at the beginning of the year, but hit a dead end because there wasn't a perfect word-for-word alignment of German and English in my corpus
def corpusmeth(input):
	"""Print co-occurrence counts of English words for the German word *input*.

	Loads the German and English halves of the sample corpus, collects the
	English sentences whose German counterpart contains *input*, counts the
	distinct English words of those sentences, and prints the resulting
	{german word: [[english word, count], ...]} dictionary.
	"""
	german_sentences = readcorpus("de-news-sample.de.al")
	english_sentences = readcorpus("de-news-sample.en.al")
	counts = {}
	matches = findincorpus(german_sentences, english_sentences, input)
	# adddef takes the English corpus first, then the German one.
	adddef(english_sentences, german_sentences, counts, input, matches)
	print(counts)
def findincorpus(decorpus, encorpus, input):
	"""Return the English sentences aligned with German sentences containing *input*.

	decorpus and encorpus are treated as parallel lists: the sentence at
	position i of one belongs to the sentence at position i of the other.
	A German sentence matches when *input* occurs anywhere inside it
	(plain substring test).

	Sentences are paired by their own position rather than by
	decorpus.index(), so a German sentence that occurs more than once
	yields each of its own English counterparts, and the scan is linear.

	Raises IndexError if a matching German sentence has no English
	sentence at the same position.
	"""
	return [
		encorpus[position]
		for position, sentence in enumerate(decorpus)
		if input in sentence
	]
def readcorpus(filename):
	"""Return the lines of *filename* as a list of strings without newlines.

	The file is split on '\n'.  Only the empty string produced by a
	trailing newline is discarded, so the last sentence is kept even when
	the file does not end with a newline.  Blank lines inside the file are
	preserved because line positions are used to align the two corpora.
	"""
	with open(filename) as handle:
		lines = handle.read().split('\n')
	# split() always yields at least one element; drop it only if it is
	# the empty remainder after a final newline.
	if lines[-1] == '':
		lines.pop()
	return lines

def adddef(encorpus, decorpus, corpusdict, deword, sentences):
	"""Count every distinct English word of each sentence as a candidate for *deword*.

	sentences is a list of English sentences (space-separated words).  For
	each sentence, each distinct word is passed once to addprob, which
	increments that word's count under corpusdict[deword].

	encorpus and decorpus are kept in the signature for existing callers
	but are not read: looking up and de-duplicating the German counterpart
	of every sentence produced a value that was never used, at the cost of
	a linear search per sentence.
	"""
	for sentence in sentences:
		for enword in removeduplicates(sentence.split(' ')):
			addprob(corpusdict, deword, enword)
def removeduplicates(sentence):
	"""Return the items of *sentence* in first-seen order with repeats dropped."""
	unique = []
	for token in sentence:
		if token in unique:
			continue
		unique.append(token)
	return unique

def addprob(corpusdict, deword, enword):
	"""Increment the co-occurrence count of *enword* under *deword*.

	corpusdict maps a German word to a list of [english word, count]
	entries.  If *enword* already has an entry its count goes up by one;
	otherwise a new [enword, 1] entry is appended.  The dictionary is
	modified in place and nothing is returned.
	"""
	entries = corpusdict.setdefault(deword, [])
	for entry in entries:
		if entry[0] == enword:
			entry[1] += 1
			# Entries created here are unique per English word, so the
			# first hit is the only one.
			break
	else:
		entries.append([enword, 1])

#Dictionary methods - What I'm focusing on
def dictionarymeth(input):
	"""Load 'dict.txt' and run the translation pipeline on *input*.

	Returns whatever translate() returns, so callers can use the result
	instead of it being thrown away.
	"""
	dictionary = {}
	readdict(dictionary, 'dict.txt')
	return translate(dictionary, input)

def readdict(hashtable, filename):
	"""Read a dictionary file and store its entries in *hashtable*.

	Each non-empty line is split on single spaces: the first field is the
	German word, and the remaining fields are stored as one list value via
	register(), so hashtable[german] ends up as a list of such lists.

	Empty lines (including the remainder after a final newline) are
	skipped, and the last line is read even when the file does not end
	with a newline.
	"""
	with open(filename) as handle:
		lines = handle.read().split('\n')
	for line in lines:
		if not line:
			# Nothing to register; avoids creating an entry under ''.
			continue
		fields = line.split(' ')
		register(hashtable, fields[0], fields[1:])

def register(hashtable, key, value):
	"""Append *value* to the list stored under *key*, creating the list on first use."""
	hashtable.setdefault(key, []).append(value)

def translate(hashtable, input):
	"""Run all analysis components on *input* and return the last stage's result.

	Stages, in order: pospeech (part-of-speech tags), properties
	(grammatical readings, given the tags and the dictionary), then
	lemmatizer.

	*input* is split on runs of whitespace, so leading, trailing or
	repeated spaces (and an empty string) never produce empty words for
	the tagger.
	"""
	words = input.split()
	tagged = pospeech(words)
	attribs = properties(tagged, hashtable)
	# `lemmatizer` is the only lemmatisation function in this file; the
	# name `lemmatize` used here before is not defined anywhere in it.
	# NOTE(review): properties() returns a tuple (or None) while
	# lemmatizer's parameter is called `word` - confirm what it should receive.
	return lemmatizer(attribs)
def pospeech(words):
	"""Tag each word with a coarse part of speech from position and capitalisation.

	Returns a list of [word, tag] lists in input order.  tag is one of
	"nou", "pn", "ver", "art", "adj", or "" when no rule applies.

	First pass, first matching rule wins:
	  1. capitalised and not the first word -> "nou"
	  2. a known pronoun                     -> "pn"
	  3. directly after a noun or pronoun    -> "ver"
	  4. a known article                     -> "art"
	Second pass: a still-untagged word directly before a noun or directly
	after a verb becomes "adj".

	Positions come from enumerate(), so a word that appears more than once
	is judged by where it actually stands.  Pronouns and articles are
	matched as whole words, not as substrings of the word list.
	"""
	# Pronouns and articles are few enough to be hardcoded.
	pronouns = frozenset("ich du er sie es ihn ihm ihr uns euch wir ihr ihnen".split())
	articles = frozenset("der die das den dem".split())
	# TODO: prepositions (durch fuer gegen ohne um bei mit nach seit von zu)
	# are not tagged yet.

	wordtags = []
	for position, word in enumerate(words):
		lowered = word.lower()
		previous_tag = wordtags[position - 1][1] if position > 0 else None
		# word[:1] instead of word[0] so an empty string cannot raise.
		if position > 0 and word[:1].isupper():
			tag = "nou"
		elif lowered in pronouns:
			tag = "pn"
		elif previous_tag in ("nou", "pn"):
			tag = "ver"
		elif lowered in articles:
			tag = "art"
		else:
			tag = ""
		wordtags.append([word, tag])

	last = len(wordtags) - 1
	for position, entry in enumerate(wordtags):
		if entry[1] != "":
			continue
		before_noun = position < last and wordtags[position + 1][1] == "nou"
		after_verb = position > 0 and wordtags[position - 1][1] == "ver"
		if before_noun or after_verb:
			entry[1] = "adj"
	return wordtags
	
def properties(words,d): #find linguistic properties like case and plurals and past tense

	
	dem = ["dem", [["dat,""mas"],["dat","neu"]]] #Stores possible pairs for each article - "dem" can be dative and masculine OR dative and neuter
	der = ["der", [["nom","mas"],["dat","fem"]]]
	die = ["die", [["nom","fem"],["akk","fem"],["nom","pl"], ["akk", "pl"]]]
	das = ["das", [["nom","neu"],["akk","neu"]]]
	den = ["den", [["akk","mas"],["dat", "pl"]]]

	pntable = {}

#	for key in d.keys():
#		for wordset in d[key]:
#			if wordset[1] == "pn":
#				register(pntable, key, wordset)

	artset = [der, die, das, die, den, dem]
	for x in words:
		POS = x[1]
		if POS is "nou":
			modifier = []
			article = []
			posartpairs = []
			posmodpairs = []
			probable1 = []
			
			if words.index(x) is not 0:
				prev = words[words.index(x) - 1]
				while prev[1] is not "art" and words.index(prev) is not 0:
					if prev[1] is "adj":
						modifier = prev #Stores the closet preceding adjective as the modifier
					prev = words[words.index(prev) -1]
				article = prev
			if article is not []:
				for y in artset:
					if article[0].lower() in y:
						for pair in y[1]:
							posartpairs.append(pair)
				if modifier != []:
					end = modifier[0][len(modifier[0])-2:len(modifier[0])]
					if end == "en": #Generates linguistic pairs based on adjective endings, simliar to my artset above
						posmodpairs = ["akk","mas"],["nom","pl"],["akk","pl"],["dat","pl"],["dat","mas"],["dat","fem"],["dat","neu"]
		
					elif modifier[0][len(modifier[0])-1] is "e":
						posmodpairs = ["akk","fem"],["akk","neu"], ["nom","mas"],["nom","fem"],["nom","neu"]
					for b in posmodpairs:
						if b in posartpairs:
							probable1.append(b)
				else:
					probable1 = posartpairs
			return x[0], probable1

		if POS is "ver":
			#something to try later: find the two nearest nouns and pronouns from right and left, check which is nominative, match up the number with the verb ending
			verb = x[0]
			tense = ""
			pairs = []
			
			if verb[0:2] is "ge": #This prefix in German indicates a verb in the past perfect tense
				tense = "pperfect"
			else:
				tense = "pres"
			l = len(verb)
			end = verb[l-2:l]
			print end
			if end == "en": #More pairs based on endings - if a verb ends with "en", it can either be first person or third person plural
				pairs = ["1","pl"],["3","pl"]
			elif end == "st":
				pairs = [["2","sing"]]
			if verb[len(verb)-1] is "t":
				pairs.append(["3","sing"])
				pairs.append(["2","pl"])
			elif verb[len(verb)-1] is "e":
				pairs = [["1","sing"]]
			return verb, pairs, tense
def lemmatizer(word):
	"""Placeholder lemmatizer: return *word* unchanged.

	Lemmatization is not implemented yet.  Until it is, the argument is
	passed through untouched so the module parses and callers receive
	their input back.
	"""
	# TODO: reduce inflected forms to their dictionary form.
	return word

# Run the interactive prompt only when executed as a script, so importing
# this module does not block waiting for input.
if __name__ == "__main__":
	main()
#Things to do:
#POS TAGGING - Statistical method? 
#LEMMATIZE