더하기 초간단 랜덤 포레스트.
lang:py
# -*- coding: utf-8 -*-
import logging
import math
import numpy as np
import random
class TreeNode(object):
def __init__(self, test_feature, test_value, left, right):
self.test_feature = test_feature
self.test_value = test_value
self.left = left
self.right = right
def predict(self, x):
if self.test_feature != None:
test = x[self.test_feature] <= self.test_value
return (self.left if test else self.right).predict(x)
return self.test_value
class Forest(object):
def __init__(self, trees):
self.trees = trees
def predict(self, x):
return np.mean([t.predict(x) for t in self.trees])
class MSE(object):
def __init__(self, val):
self.sqr_csum = (val ** 2).cumsum()
self.csum = val.cumsum()
@staticmethod
def range_sum(a, b, cs):
return cs[b] if a == 0 else cs[b] - cs[a-1]
def sum(self, a, b):
return self.range_sum(a, b, self.csum)
def calc(self, a, b, x):
return (self.range_sum(a, b, self.sqr_csum)
- 2 * x * self.range_sum(a, b, self.csum)
+ (b - a + 1) * x * x) / (b - a + 1)
def learn_tree(X, y, m=None, samples=None, minsize=5, minsplit=5):
N, p = X.shape
samples = samples if samples != None else range(N)
nodes = [0]
def go(samples, depth=0):
nodes[0] += 1
N = len(samples)
if N <= minsize:
return TreeNode(None, np.mean(y[samples]), None, None)
attributes = xrange(p) if m == None else random.sample(xrange(p), m)
best_attribute, best_value, best_mse = None, None, np.inf
for attrib in attributes:
samples.sort(cmp=lambda a, b: cmp(X[a][attrib], X[b][attrib]))
mse = MSE(y[samples])
for at in xrange(minsplit,N-minsplit):
if X[samples[at]][attrib] == X[samples[at+1]][attrib]: continue
left_mse = mse.calc(0, at, mse.sum(0, at) / (at + 1))
right_mse = mse.calc(at+1, N-1, mse.sum(at+1, N-1) / (N-at-1))
cand_mse = left_mse + right_mse
if cand_mse < best_mse:
best_mse = cand_mse
best_attribute = attrib
best_value = X[samples[at]][attrib]
if best_attribute == None:
return TreeNode(None, np.mean(y[samples]), None, None)
left_branch = [s for s in samples if X[s][best_attribute] <= best_value]
right_branch = [s for s in samples if X[s][best_attribute] > best_value]
"""
logging.info("depth=%d, N=%d, best_attribute=%d, best_value=%g, "
"best_mse=%g, left_branch->%d, right_branch->%d",
depth,
N, best_attribute, best_value, best_mse, len(left_branch),
len(right_branch))
"""
return TreeNode(best_attribute, best_value,
go(left_branch, depth+1), go(right_branch, depth+1))
ret = go(samples)
logging.info("learn_tree: N=%d, p=%d, minsize=%d => %d nodes",
len(samples), p, minsize, nodes[0])
return ret
def learn_forest(X, y, m=None, samples=None, minsize=5, minsplit=5,
r=0.5, trees=200):
N, p = X.shape
m = m or int(sqrt(p))
Z = int(len(X) * r)
return Forest([learn_tree(X, y, m, [random.randint(0, N-1)
for i in xrange(Z)], minsize, minsplit)
for t in xrange(trees)])
def test_learn_tree():
X = np.array([[0,1,2,3,4,5,6,7],
[4,4,4,6,6,6,6,6],
[1,2,3,7,6,5,1,2],
[1,2,3,3,3,4,5,6]]).T
y = np.array([7,7,8,8,1,1,1,1])
r = learn_tree(X, y)
assert r.test_feature == 0
assert r.test_value == 3
assert r.left.test_feature == None
assert r.right.test_feature == None
assert r.left.test_value == 7.5
assert r.right.test_value == 1
z = [r.predict(row) for row in X]
assert z == [7.5, 7.5, 7.5, 7.5, 1, 1, 1, 1], z
def test_mse():
arr = np.random.random(100)
mse = MSE(arr)
tests = ([[0, 99], [0, 1], [0, 0], [99, 99], [98, 99]] +
[random.sample(xrange(100), 2) for i in range(1000)])
for a, b in tests:
if a > b: a, b = b, a
x = np.random.random()
mean_sq_error = np.mean((arr[a:b+1] - x) ** 2)
print arr[a:b+1]
print x
print mean_sq_error
print mse.calc(a,b,x)
assert abs(mean_sq_error - mse.calc(a, b, x)) < 0.000001
해놓고 보니 sum of squared error 로 쪼개야 하는데 mean 으로 쪼갰다. -.-; 고쳐야지..


