import random
import re
import time

# Matches the leading "YYYY-MM-DD HH:MM:SS" portion of a timestamp string.
_TIMESTAMP_RE = re.compile(
    '([0-9]{4})-([0-9]{2})-([0-9]{2}) ([0-9]{2}):([0-9]{2}):([0-9]{2})')

_SECONDS_PER_DAY = 60 * 60 * 24


class equation:
    """Decay model for the price impact of a piece of news.

    Equation format: the price delta (%) for a piece of news is
    ``S * c ** (-k * t)`` where ``S`` is the score, ``t`` the elapsed time
    and ``c`` / ``k`` are the constants held by the instance.
    """

    # Class-level defaults, mirrored by the constructor defaults.
    c = 1
    k = 1

    def __init__(self, c=1, k=1):
        self.c = c
        self.k = k

    def calc(self, score, dt):
        """Return ``score * c ** (-k * dt)`` for a single data point."""
        return score * self.c ** (-self.k * dt)

    def calcseries(self, series):
        """Return the sum of ``calc(score, dt)`` over ``(score, dt)`` pairs."""
        return sum(self.calc(*dat) for dat in series)

    def __str__(self):
        return "score*%s**(-%s*dt)" % (self.c, self.k)


def _timestamp_days(value):
    """Convert a "YYYY-MM-DD HH:MM:SS..." value to local-time days since epoch.

    ``value`` is passed through ``str()`` first; only the leading date/time
    part is parsed, anything after the seconds is ignored.
    """
    fields = [int(part) for part in _TIMESTAMP_RE.match(str(value)).groups()]
    # time.mktime needs a 9-item tuple; weekday, yearday and isdst are all 0.
    return time.mktime(tuple(fields + [0, 0, 0])) / _SECONDS_PER_DAY


def _relative_error(eq, target, given):
    """Return ``abs(1 - target / eq.calcseries(given))``, or inf on failure.

    Arithmetic failures (zero sum, 0 raised to a negative power, overflow)
    get the worst possible fitness so that such candidates are never kept
    in preference to an equation that could actually be evaluated.
    """
    try:
        return abs(1 - target / float(eq.calcseries(given)))
    except (ZeroDivisionError, OverflowError, ValueError, TypeError):
        return float('inf')


def regress(target, dataset, generations=100, children=8, keep=.5, debug=False):
    """Regress equations to match target. Returns the best equation found.

    A small genetic search: each generation every candidate is scored by the
    relative error between ``target`` and its ``calcseries`` over the data,
    the best ``keep`` fraction survives, and the population is refilled with
    offspring produced by ``spawn``.

    target      -- value the summed series should reproduce.
    dataset     -- list of dicts with ``'time'`` ("YYYY-MM-DD HH:MM:SS") and
                   ``'normalized_score'`` entries.  Each dict is updated in
                   place with ``'timestamp'`` (days) and ``'differential'``
                   (days before the newest item).
    generations -- number of selection/breeding rounds.
    children    -- population size.
    keep        -- fraction of the population that survives each round
                   (at least one candidate is always kept).
    debug       -- when true, print progress information.
    """
    # Truncate to a whole number of survivors; never let the pool be empty,
    # otherwise there would be no parents to breed from.
    keep = max(1, int(keep * children))
    if debug:
        print(keep)

    latest = 0
    for item in dataset:
        stamp = _timestamp_days(item['time'])  # timestamp normalized to days
        item['timestamp'] = stamp
        if stamp > latest:
            latest = stamp
    for item in dataset:
        item['differential'] = latest - item['timestamp']

    # Population entries are [fitness, equation] pairs.
    eqs = [[0, equation(random.random(), random.random())]
           for _ in range(children)]
    given = [[item['normalized_score'], item['differential']]
             for item in dataset]

    for _ in range(generations):
        for eq in eqs:
            eq[0] = _relative_error(eq[1], target, given)
        # Sort on fitness only so ties never compare equation objects.
        eqs.sort(key=lambda pair: pair[0])
        eqs = eqs[:keep]
        offspring = []
        for _ in range(children - keep):
            parent1 = random.choice(eqs)[1]
            parent2 = random.choice(eqs)[1]
            offspring.append(spawn(parent1, parent2))
        # Survivors stay at the front, so eqs[0] is always the current best.
        eqs += offspring
        if debug:
            print("Best: %.4f" % eqs[0][0])

    if debug:
        print(eqs[0][1].calcseries(given))
    return eqs[0][1]


def spawn(p1, p2, mutchance=.4, mutamt=.2):
    """Breed a child from two equations and return it as ``[0, equation]``.

    The child always takes ``c`` from one parent and ``k`` from the other.
    With probability ``mutchance`` one of the two constants (chosen at
    random) is scaled up or down by the fraction ``mutamt``.
    """
    att = random.random()
    if att < .5:
        child = equation(p2.c, p1.k)
    else:
        child = equation(p1.c, p2.k)

    if random.random() < mutchance:
        mutate_c = random.random() < .5
        factor = 1 + (-mutamt if random.random() < .5 else mutamt)
        if mutate_c:
            child.c = child.c * factor
        else:
            child.k = child.k * factor
    return [0, child]