JMK no matter what

일하기 싫어서 구현한 리그레션 트리

더하기 초간단 랜덤 포레스트.

lang:py
# -*- coding: utf-8 -*-

import logging
import math
import numpy as np
import random

class TreeNode(object):
    def __init__(self, test_feature, test_value, left, right):
        self.test_feature = test_feature
        self.test_value = test_value
        self.left = left
        self.right = right
    def predict(self, x):
        if self.test_feature != None:
            test = x[self.test_feature] <= self.test_value
            return (self.left if test else self.right).predict(x)
        return self.test_value

class Forest(object):
    def __init__(self, trees):
        self.trees = trees
    def predict(self, x):
        return np.mean([t.predict(x) for t in self.trees])

class MSE(object):
    def __init__(self, val):
        self.sqr_csum = (val ** 2).cumsum()
        self.csum = val.cumsum()
    @staticmethod
    def range_sum(a, b, cs):
        return cs[b] if a == 0 else cs[b] - cs[a-1]
    def sum(self, a, b):
        return self.range_sum(a, b, self.csum)
    def calc(self, a, b, x):
        return (self.range_sum(a, b, self.sqr_csum)
                - 2 * x * self.range_sum(a, b, self.csum)
                + (b - a + 1) * x * x) / (b - a + 1)

def learn_tree(X, y, m=None, samples=None, minsize=5, minsplit=5):
    N, p = X.shape
    samples = samples if samples != None else range(N)
    nodes = [0]
    def go(samples, depth=0):
        nodes[0] += 1
        N = len(samples)
        if N <= minsize:
            return TreeNode(None, np.mean(y[samples]), None, None)
        attributes = xrange(p) if m == None else random.sample(xrange(p), m)
        best_attribute, best_value, best_mse = None, None, np.inf
        for attrib in attributes:
            samples.sort(cmp=lambda a, b: cmp(X[a][attrib], X[b][attrib]))
            mse = MSE(y[samples])
            for at in xrange(minsplit,N-minsplit):
                if X[samples[at]][attrib] == X[samples[at+1]][attrib]: continue
                left_mse = mse.calc(0, at, mse.sum(0, at) / (at + 1))
                right_mse = mse.calc(at+1, N-1, mse.sum(at+1, N-1) / (N-at-1))
                cand_mse = left_mse + right_mse
                if cand_mse < best_mse:
                    best_mse = cand_mse
                    best_attribute = attrib
                    best_value = X[samples[at]][attrib]
        if best_attribute == None:
            return TreeNode(None, np.mean(y[samples]), None, None)
        left_branch = [s for s in samples if X[s][best_attribute] <= best_value]
        right_branch = [s for s in samples if X[s][best_attribute] > best_value]

        """
        logging.info("depth=%d, N=%d, best_attribute=%d, best_value=%g, "
                     "best_mse=%g, left_branch->%d, right_branch->%d",
                     depth,
                     N, best_attribute, best_value, best_mse, len(left_branch),
                     len(right_branch))
                     """
        return TreeNode(best_attribute, best_value,
                    go(left_branch, depth+1), go(right_branch, depth+1))

    ret = go(samples)
    logging.info("learn_tree: N=%d, p=%d, minsize=%d => %d nodes",
                 len(samples), p, minsize, nodes[0])
    return ret

def learn_forest(X, y, m=None, samples=None, minsize=5, minsplit=5,
                 r=0.5, trees=200):
    N, p = X.shape
    m = m or int(sqrt(p))
    Z = int(len(X) * r)
    return Forest([learn_tree(X, y, m, [random.randint(0, N-1)
                                        for i in xrange(Z)], minsize, minsplit)
                   for t in xrange(trees)])


def test_learn_tree():
    X = np.array([[0,1,2,3,4,5,6,7],
                  [4,4,4,6,6,6,6,6],
                  [1,2,3,7,6,5,1,2],
                  [1,2,3,3,3,4,5,6]]).T
    y = np.array([7,7,8,8,1,1,1,1])
    r = learn_tree(X, y)
    assert r.test_feature == 0
    assert r.test_value == 3
    assert r.left.test_feature == None
    assert r.right.test_feature == None
    assert r.left.test_value == 7.5
    assert r.right.test_value == 1
    z = [r.predict(row) for row in X]
    assert z == [7.5, 7.5, 7.5, 7.5, 1, 1, 1, 1], z

def test_mse():
    arr = np.random.random(100)
    mse = MSE(arr)
    tests = ([[0, 99], [0, 1], [0, 0], [99, 99], [98, 99]] +
             [random.sample(xrange(100), 2) for i in range(1000)])
    for a, b in tests:
        if a > b: a, b = b, a
        x = np.random.random()
        mean_sq_error = np.mean((arr[a:b+1] - x) ** 2)
        print arr[a:b+1]
        print x
        print mean_sq_error
        print mse.calc(a,b,x)
        assert abs(mean_sq_error - mse.calc(a, b, x)) < 0.000001

해놓고 보니 sum of squared error 로 쪼개야 하는데 mean 으로 쪼갰다. -.-; 고쳐야지..

2010-05-07 08:16:36 | JM | /writings/cs/ | 2 개의 댓글들
LIBe
2010-05-07 14:58:42
퍼가요~~~
JM
2010-05-13 00:32:25
@LIBe, 내 시간 나면 제대로 만들어서 공개하고픈 맘이 있네. 짜기가 어렵진 않은데 어째 공개된 건 없더라고. ㅡ,.ㅡ

댓글 남기기