def gini(y):
    """Return the Gini impurity of the label array ``y``.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of
    entries in ``y`` equal to the distinct label ``k``.

    Args:
        y: Label array exposing ``.shape`` and boolean-mask indexing
            (e.g. a 1-D NumPy array).

    Returns:
        The impurity value. For an empty ``y`` there are no class terms to
        subtract, so the result is the integer ``1``.
    """
    total = y.shape[0]
    # Squared share of every distinct label present in y.
    squared_shares = [
        (y[y == label].shape[0] / total) ** 2
        for label in set(y)
    ]
    impurity = 1
    for share in squared_shares:
        impurity -= share
    return impurity
def CART(X, y):
    """Recursively build a CART classification tree using the Gini index.

    Every (feature, threshold) pair is scored by the size-weighted Gini
    impurity of the two partitions ``X[:, feature] <= threshold`` and
    ``X[:, feature] > threshold``; the pair with the lowest score becomes
    the split of the current node.

    Args:
        X: 2-D NumPy array of samples (rows) by features (columns).
        y: NumPy array of class labels; the boolean row masks built from
            ``X`` are applied to it, so it must have one entry per row.

    Returns:
        A class label when the node is a leaf, otherwise a dict with the
        keys ``'feature'`` (column index), ``'value'`` (threshold),
        ``'left'`` (subtree for ``<= value``) and ``'right'`` (subtree for
        ``> value``).

    Raises:
        ValueError: If ``y`` is empty.
    """
    n_samples = y.shape[0]
    if n_samples == 0:
        raise ValueError("CART requires at least one sample")

    # If every sample belongs to the same class C_k, the node is a leaf
    # labelled with C_k.
    if len(set(y)) == 1:
        return y[0]

    best_gini = np.inf
    best_left_mask = best_right_mask = None
    best_feature = best_value = None

    # Try every value of every feature as a threshold. np.unique yields the
    # candidates in sorted order, so ties are broken deterministically.
    for feature in range(X.shape[1]):
        column = X[:, feature]
        for value in np.unique(column):
            # Partition the samples into R_1 (<= value) and R_2 (> value).
            left_mask = column <= value
            right_mask = column > value
            n_left = np.count_nonzero(left_mask)
            n_right = np.count_nonzero(right_mask)
            # A split with an empty side separates nothing and would make
            # the recursion below loop forever on the same data.
            if n_left == 0 or n_right == 0:
                continue
            # Gini index of the split: each side weighted by its share of
            # the samples.
            split_gini = (
                n_left * gini(y[left_mask]) + n_right * gini(y[right_mask])
            ) / n_samples
            # Keep the (feature, value) pair with the smallest Gini index.
            if split_gini < best_gini:
                best_gini = split_gini
                best_feature, best_value = feature, value
                best_left_mask, best_right_mask = left_mask, right_mask

    # No feature can separate the samples (e.g. identical rows with
    # different labels, or no feature columns): fall back to a leaf that
    # carries the majority class.
    if best_feature is None:
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    # Create the two child nodes and distribute the data between them.
    node = {
        'feature': best_feature,
        'value': best_value,
        'left': CART(X[best_left_mask], y[best_left_mask]),
        'right': CART(X[best_right_mask], y[best_right_mask]),
    }
    return node