def gRDA(X, y, feature):
# 计算X的每个特征可取到的值
a = set(X[:,feature])
# 计算数据集的经验熵
HD = H(y)
# 计算特征A对数据集D的经验条件熵H(D|A)
HDA = 0
for value in a:
yDi = y[X[:,feature]==value]
HDA += yDi.shape[0]/y.shape[0] * H(yDi)
return (HD - HDA)/HD
def C45(X, y, epsilon):
# 若D中所有实例属于同一类
if len(set(y))==1:
# 将类$$C_k$$作为该结点的类标记
return y[0]
# 若$$A=\emptyset$$
if X.shape[1] == 0:
# 实例数最大的类$$C_k$$作为该结点的类标记
return multi(y)
bestInfo = 0
# 计算A中各个特征对D的信息增益
for feature in range(X.shape[1]):
info = gRDA(X, y, feature)
# 选择信息特征最大的Ag
if svm(X, y, feature) > bestInfo:
bestInfo = info
bestfeature = feature
# 如果Ag的信息增益小于阈值$$\epsilon$$
if bestInfo < epsilon:
# 将D中实例数最大的类$$C_k$$作为该结点的类标记
return multi(y)
feature = bestfeature
ret = {'feature':feature}
# 对Ag的每一个可能的值ai
a = set(X[:, feature])
for ai in a:
yai = y[X[:,feature] == ai]
Xai = X[X[:,feature] == ai]
ret[ai] = C45(Xai, yai, epsilon)
return ret