51hei.png (21.8 KB, 下載次數: 90)
下載附件
2020-12-18 16:42 上傳
- "#測試gini\n",
- "gini=calGini((l,r),classLabels)\n",
- "print(gini)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [],
- "source": [
- "def getBestSplit(dataSet,featureNumbers):\n",
- " '''\n",
- " 對于一個數據集,選擇featureNumber個特征進行簡單劃分,得到最好的特征和劃分結果\n",
- " args:\n",
- " dataSet:數據集,類型:list\n",
- " featureNumbers:選擇的特征值數,類型:int\n",
- " classLabels:所有分類,類型:list\n",
- " ''' \n",
- " \n",
- " #樣本數\n",
- " m=len(dataSet)\n",
- " if m==0:\n",
- " return None\n",
- " #樣本特征值數+1(因為最后有一個標簽)\n",
- " totalColumnNumber=len(dataSet[0])\n",
- " #隨機選擇的特征索引\n",
- " randomSelectedFeatures=[]\n",
- " \n",
- " \n",
- " \n",
- " #選擇數目必須在特征數目范圍內\n",
- " if totalColumnNumber-1>=featureNumbers: \n",
- " #借助這個變量防止選擇重復的特征進入\n",
- " indexList=list(range(totalColumnNumber-1)) \n",
- " for j in range(featureNumbers):\n",
- " #索引序列長度\n",
- " leftSize=len(indexList)\n",
- " #隨機數\n",
- " randIndex=random.randrange(leftSize)\n",
- " #索引學列隨機數處數據彈出,放入選擇特征列表\n",
- " origIndex=indexList.pop(randIndex)\n",
- " #存入的是原始數據特征索引\n",
- " randomSelectedFeatures.append(origIndex)\n",
- " else:\n",
- " randomSelectedFeatures=range(totalColumnNumber-1)#特征全部被選擇\n",
- " \n",
- " \n",
- " # print(\"current select features\")\n",
- " # print(randomSelectedFeatures)\n",
- "\n",
- " #當前數據集的標簽序列\n",
- " class_values=list(set(item[-1] for item in dataSet))\n",
- " \n",
- " #對于每個特征以及每個特征值進行簡單劃分\n",
- " #保留最小的基尼系數\n",
- " minGini=9999\n",
- " #存入最好的信息\n",
- " bestInfor={}\n",
- " #外層循環,對于每個特征\n",
- " for index in randomSelectedFeatures:\n",
- " #內層循環對于每個特征值\n",
- " tempFeatureValueList=list(set(item[index] for item in dataSet))\n",
- " #print(len(tempFeatureValueList))\n",
- " for tempValue in tempFeatureValueList:\n",
- " #簡單分類\n",
- " groups=simpleSplit(dataSet,index,tempValue) \n",
- " #print(\"currentIndex:%d,CurrentTempValue:%f\"%(index,tempValue))\n",
- " #計算基尼系數\n",
- " gini=calGini(groups,class_values)\n",
- " #print(\"computed gini:\",gini) \n",
- " if gini<minGini:\n",
- " minGini=gini\n",
- " #保存目前最后的信息\n",
- " bestInfor[\"index\"]=index#存入原來索引 \n",
- " bestInfor[\"indexValue\"]=tempValue\n",
- " bestInfor[\"groups\"]=groups\n",
- " bestInfor[\"gini\"]=gini\n",
- " \n",
- " return bestInfor"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "52 0.017\n"
- ]
- }
- ],
- "source": [
- "#測試最好分類函數\n",
- "bestInfor=getBestSplit(dataSet,3)\n",
- "print(bestInfor[\"index\"],bestInfor[\"indexValue\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {},
- "outputs": [],
- "source": [
- "def terminalLabel(subSet):\n",
- " '''\n",
- " 樹葉點對應的標簽\n",
- " args:\n",
- " subSet:當前數據集,最后列是標簽列,類型:list\n",
- " returns:\n",
- " 當前列中最多的標簽,類型:原標簽類型\n",
- " '''\n",
- " #得到最后一列\n",
- " labelList=[item[-1] for item in subSet]\n",
- " #max函數,key后是函數,代表對前面的進行那種運算,這里是技術\n",
- " #max返回值是第一個參數,這里set是把labelList轉換成集合,即去掉重復項\n",
- " #key:相當于循環調用labelList.count(set(labelList))中的每個元素,然后max取得最大值\n",
- " #返回set(labelList)中對應最大的那個標簽\n",
- " return max(set(labelList), key=labelList.count) # 輸出 subSet 中出現次數較多的標簽 \n",
- "\n",
- " #下面的寫法也是成立的,利用lambda表達式,表達式中x從全面取,這種寫法可能更好理解些\n",
- " #return max(set(labelList), key=lambda x:labelList.count(x)) "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "R\n"
- ]
- }
- ],
- "source": [
- "#測試\n",
- "label=terminalLabel(l)\n",
- "print(label)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [],
- "source": [
- "#對得到的最好分類信息進行分割\n",
- "def split(node, max_depth, min_size, n_features, depth): # 創建子分割器 遞歸分類 直到分類結束\n",
- " '''\n",
- " :param node: 節點,類型:字典\n",
- " bestInfor[\"index\"]=index#存入原來索引 \n",
- " bestInfor[\"indexValue\"]=tempValue\n",
- " bestInfor[\"groups\"]=groups\n",
- " bestInfor[\"gini\"]=gini\n",
- " :param max_depth: 最大深度,int\n",
- " :param min_size: 最小,int\n",
- " :param n_features: 特征選取個數,int\n",
- " :param depth: 深度,int\n",
- " :return:\n",
- " '''\n",
- " left, right = node['groups']\n",
- " del (node['groups'])\n",
- "\n",
- " if not left or not right: # 如果只有一個子集\n",
- " node['left'] = node['right'] = terminalLabel(left + right) # 投票出類型\n",
- " return\n",
- "\n",
- " if depth >= max_depth: # 如果即將超過\n",
- " node['left'], node['right'] = terminalLabel(left), terminalLabel(right) # 投票出類型\n",
- " return\n",
- "\n",
- " if len(left) <= min_size: # 處理左子集\n",
- " node['left'] = terminalLabel(left)\n",
- " else:\n",
- " node['left'] = getBestSplit(left, n_features) # node['left']是一個字典,形式為{'index':b_index, 'value':b_value, 'groups':b_groups},所以node是一個多層字典\n",
- " split(node['left'], max_depth, min_size, n_features, depth + 1) # 遞歸,depth+1計算遞歸層數\n",
- "\n",
- " if len(right) <= min_size: # 處理右子集\n",
- " node['right'] = terminalLabel(right)\n",
- " else:\n",
- " node['right'] = getBestSplit(right, n_features)\n",
- " split(node['right'], max_depth, min_size, n_features, depth + 1)\n",
- " "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {},
- "outputs": [],
- "source": [
- "#構建一個決策樹\n",
- "def buildTree(train, max_depth, min_size, n_features):\n",
- " '''\n",
- " 創建一個決策樹\n",
- " :param train: 訓練數據集\n",
- " :param max_depth: 決策樹深度不能太深 不然容易導致過擬合\n",
- " :param min_size: 葉子節點的大小\n",
- " :param n_features: 選擇的特征的個數\n",
- " :return\n",
- " root 返回決策樹\n",
- " '''\n",
- " root = getBestSplit(train, n_features) # 獲取樣本數據集\n",
- " split(root, max_depth, min_size, n_features, 1) # 進行樣本分割,構架決策樹\n",
- " return root # 返回決策樹\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'index': 55, 'indexValue': 0.0114, 'gini': 0.0, 'left': {'index': 35, 'indexValue': 0.2288, 'gini': 0.0, 'left': 'R', 'right': {'index': 33, 'indexValue': 0.2907, 'gini': 0.0, 'left': 'R', 'right': {'index': 58, 'indexValue': 0.0057, 'gini': 0.0, 'left': {'index': 12, 'indexValue': 0.0493, 'gini': 0.0, 'left': 'R', 'right': 'R'}, 'right': 'R'}}}, 'right': {'index': 54, 'indexValue': 0.0063, 'gini': 0.0, 'left': {'index': 21, 'indexValue': 0.8384, 'gini': 0.0, 'left': 'M', 'right': 'M'}, 'right': {'index': 32, 'indexValue': 0.558, 'gini': 0.0, 'left': 'M', 'right': {'index': 58, 'indexValue': 0.0332, 'gini': 0.0, 'left': 'M', 'right': 'M'}}}}\n"
- ]
- }
- ],
- "source": [
- "#測試決策樹\n",
- "#選擇一個子集\n",
- "s=putBackSample(dataSet,10)\n",
- "tempTree=buildTree(s,10,1,3)\n",
- "print(tempTree)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {},
- "outputs": [],
- "source": [
- "#根據決策樹進行預測\n",
- "def predict(node, row): # 預測模型分類結果\n",
- " '''\n",
- " 在當前節點進行預測,row是待預測樣本\n",
- " args:\n",
- " node:樹節點\n",
- " row:待分類樣本\n",
- " return:\n",
- " 分類標簽\n",
- " '''\n",
- " if row[node['index']] < node['indexValue']:\n",
- " if isinstance(node['left'], dict): # isinstance 是 Python 中的一個內建函數。是用來判斷一個對象是否是一個已知的類型。\n",
- " return predict(node['left'], row)\n",
- " else:\n",
- " return node['left']\n",
- " else:\n",
- " if isinstance(node['right'], dict):\n",
- " return predict(node['right'], row)\n",
- " else:\n",
- " return node['right']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "R R\n"
- ]
- }
- ],
- "source": [
- "#測試下\n",
- "label=predict(tempTree,s[0])\n",
- "print(label,s[0][-1])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {},
- "outputs": [],
- "source": [
- "#多個樹的決策,多數服從少數\n",
- "def baggingPredict(trees, row):\n",
- " \"\"\"\n",
- " 多個樹的決策,多數服從少數\n",
- " Args:\n",
- " trees 決策樹的集合\n",
- " row 測試數據集的每一行數據\n",
- " Returns:\n",
- " 返回隨機森林中,決策樹結果出現次數做大的\n",
- " \"\"\"\n",
- "\n",
- " # 使用多個決策樹trees對測試集test的第row行進行預測,再使用簡單投票法判斷出該行所屬分類\n",
- " predictions = [predict(tree, row) for tree in trees]\n",
- " return max(set(predictions), key=predictions.count)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [],
- "source": [
- "def subSample(dataSet, ratio): \n",
- " '''\n",
- " 按比例隨機抽取數據,有重復抽樣\n",
- " args:\n",
- " dataSet:數據集,類型:list\n",
- " ratio:0-1之間的數\n",
- " '''\n",
- " if ratio<0.0:\n",
- " return None\n",
- " if ratio>=1:\n",
- " return dataSet\n",
- " sampleNumber=int(len(dataSet)*ratio)\n",
- " subSet=putBackSample(dataSet,sampleNumber)\n",
- " return subSet"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "41\n"
- ]
- }
- ],
- "source": [
- "#測試\n",
- "subSet=subSample(dataSet,0.2)\n",
- "print(len(subSet))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [],
- "source": [
- "#隨機森林主函數\n",
- "def buildRandomForest(train, max_depth=10, min_size=1, sample_size=0.2, n_trees=10, n_features=3):\n",
- " \"\"\"\n",
- " random_forest(評估算法性能,返回模型得分)\n",
- " Args:\n",
- " train 訓練數據集,類型:list \n",
- " max_depth 決策樹深度不能太深,不然容易導致過擬合\n",
- " min_size 葉子節點的大小\n",
- " sample_size 訓練數據集的樣本比例,0,1之間的數\n",
- " n_trees 決策樹的個數\n",
- " n_features 選取的特征的個數\n",
- " Returns:\n",
- " trees:樹序列\n",
- " \"\"\"\n",
- "\n",
- " trees = list()\n",
- " # n_trees 表示決策樹的數量\n",
- " for i in range(n_trees):\n",
- " # 隨機抽樣的訓練樣本, 隨機采樣保證了每棵決策樹訓練集的差異性\n",
- " sample = subSample(train, sample_size)\n",
- " # 創建一個決策樹\n",
- " tree = buildTree(sample, max_depth, min_size, n_features)\n",
- " trees.append(tree)\n",
- " return trees\n",
- " \n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {},
- "outputs": [],
- "source": [
- "def predictByForest(trees,test):\n",
- " '''\n",
- " predictions 每一行的預測結果,bagging 預測最后的分類結果\n",
- " '''\n",
- " # 每一行的預測結果,bagging 預測最后的分類結果\n",
- " predictions = [baggingPredict(trees, row) for row in test]\n",
- " return predictions"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "metadata": {},
- "outputs": [],
- "source": [
- "def calQuota(predictions,labelClass,OrigClassLabels):\n",
- " '''\n",
- " 計算分類指標\n",
- " args:\n",
- " predictions:預測值,類型:list\n",
- " labelClass:真實標簽,類型:list\n",
- " OrigClassLabels:數據可能的標簽庫,一個正例一個負例標簽\n",
- " '''\n",
- " \n",
- " Pos=OrigClassLabels[0]\n",
- " Nev=OrigClassLabels[1] \n",
- " #真正例 \n",
- " #TP=len([item for item in labelClass if item==Pos and predictions[labelClass.index(item)]==Pos])\n",
- " TP=0\n",
- " TN=0\n",
- " FP=0\n",
- " FN=0\n",
- " for j in range(len(predictions)): \n",
- " if predictions[j]==Pos and labelClass[j]==Pos:\n",
- " TP+=1\n",
- " if predictions[j]==Nev and labelClass[j]==Nev:\n",
- " TN+=1\n",
- " if predictions[j]==Pos and labelClass[j]==Nev:\n",
- " FP+=1\n",
- " if predictions[j]==Nev and labelClass[j]==Pos:\n",
- " FN+=1\n",
- "# #真負例,下面的做法不行,原因是index可能得到不同的索引\n",
- "# TN=len([item for item in labelClass if item==Nev and predictions[labelClass.index(item)]==Nev])\n",
- "# #偽正例\n",
- "# FP=len([item for item in labelClass if item==Nev and predictions[labelClass.index(item)]==Pos])\n",
- "# #偽負例\n",
- "# FN=len([item for item in labelClass if item==Pos and predictions[labelClass.index(item)]==Nev])\n",
- "\n",
- " #Recall,TruePosProp=TP/(TP+FN)#識別的正例占整個正例的比率\n",
- " #FalsPosProp=FP/(FP+TN)#識別的正例占整個負例的比率\n",
- " #Precition=TP/(TP+FP)#識別的正確正例占識別出所有正例的比率\n",
- " \n",
- " return TP,TN,FP,FN"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "metadata": {},
- "outputs": [],
- "source": [
- "#測試下:\n",
- "trees=buildRandomForest(dataSet)\n",
- "testSet=nonPutBackSample(dataSet,100)\n",
- "prediction=predictByForest(trees,testSet)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(44, 39, 12, 5)\n"
- ]
- }
- ],
- "source": [
- "labelClass=[item[-1] for item in testSet]\n",
- "\n",
- "tp=calQuota(prediction,labelClass,list(classLabels))\n",
- "print(tp)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "metadata": {},
- "outputs": [],
- "source": [
- "def accuracy( predicted,actual): \n",
- " correct = 0\n",
- " for i in range(len(actual)):\n",
- " if actual[i] == predicted[i]:\n",
- " correct += 1\n",
- " return correct / float(len(actual)) * 100.0\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "83.0\n"
- ]
- }
- ],
- "source": [
- "a=accuracy(prediction,labelClass)\n",
- "print(a)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "metadata": {},
- "outputs": [],
- "source": [
- "def createCrossValideSets(trainSet,n_folds,bPutBack=True):\n",
- " '''\n",
- " 產生交叉驗證數據集\n",
- " Args:\n",
- " dataset 原始數據集 \n",
- " n_folds 數據的份數,數據集交叉驗證的份數,采用無放回抽取\n",
- " bPutBack 是否放回\n",
- " '''\n",
- " subSetsList=[]\n",
- " subLen=int(len(trainSet)/n_folds)\n",
- " if bPutBack:\n",
- " for j in range(n_folds):\n",
- " subSet=putBackSample(trainSet,subLen)\n",
- " subSetsList.append(subSet)\n",
- " else:\n",
- " for j in range(n_folds):\n",
- " subSet=nonPutBackSample(trainSet,subLen)\n",
- " subSetsList.append(subSet)\n",
- " return subSetsList"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "metadata": {},
- "outputs": [],
- "source": [
- "def randomForest(trainSet,testSet,max_depth=10, min_size=1, sample_size=0.2, n_trees=10, n_features=3):\n",
- " '''\n",
- " 構造隨機森林并測試\n",
- " Args:\n",
- " train 訓練數據集,類型:list \n",
- " testSet 測試集,類型:list\n",
- " max_depth 決策樹深度不能太深,不然容易導致過擬合\n",
- " min_size 葉子節點的大小\n",
- " sample_size 訓練數據集的樣本比例,0,1之間的數\n",
- " n_trees 決策樹的個數\n",
- " n_features 選取的特征的個數\n",
- " Returns:\n",
- " predition 測試集預測值,類型:list\n",
- " '''\n",
- " trees=buildRandomForest(trainSet,max_depth, min_size, sample_size, n_trees, n_features)\n",
- " predition=predictByForest(trees,testSet)\n",
- " return predition"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "metadata": {},
- "outputs": [],
- "source": [
- "def evaluteAlgorithm(trainSet,algorithm,n_folds,*args):\n",
- " '''\n",
- " 評價算法函數\n",
- " Args:\n",
- " dataset 原始數據集\n",
- " algorithm 使用的算法\n",
- " n_folds 數據的份數,數據集交叉驗證的份數,采用無放回抽取\n",
- " *args 其他的參數\n",
- " Returns:\n",
- " scores 模型得分\n",
- " '''\n",
- " folds = createCrossValideSets(trainSet, n_folds)\n",
- " scores = list()\n",
- " # 每次循環從 folds 從取出一個 fold 作為測試集,其余作為訓練集,遍歷整個 folds ,實現交叉驗證\n",
- " for fold in folds:\n",
- " train_set = list(folds)\n",
- " train_set.remove(fold)\n",
- " # 將多個 fold 列表組合成一個 train_set 列表, 類似 union all\n",
- " \"\"\"\n",
- " In [20]: l1=[[1, 2, 'a'], [11, 22, 'b']]\n",
- " In [21]: l2=[[3, 4, 'c'], [33, 44, 'd']]\n",
- " In [22]: l=[]\n",
- " In [23]: l.append(l1)\n",
- " In [24]: l.append(l2)\n",
- " In [25]: l\n",
- " Out[25]: [[[1, 2, 'a'], [11, 22, 'b']], [[3, 4, 'c'], [33, 44, 'd']]]\n",
- " In [26]: sum(l, [])\n",
- " Out[26]: [[1, 2, 'a'], [11, 22, 'b'], [3, 4, 'c'], [33, 44, 'd']]\n",
- " \"\"\"\n",
- " train_set = sum(train_set, [])\n",
- " test_set = list()\n",
- " # fold 表示從原始數據集 dataset 提取出來的測試集\n",
- "# for row in fold:\n",
- "# row_copy = list(row)\n",
- "# row_copy[-1] = None\n",
- "# test_set.append(row_copy)\n",
- " predicted = algorithm(train_set, fold, *args)\n",
- " \n",
- " actual = [row[-1] for row in fold]\n",
- "\n",
- " # 計算隨機森林的預測結果的正確率\n",
- " accuracyValue = accuracy(predicted,actual)\n",
- " scores.append(accuracyValue)\n",
- " return scores"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "隨機因子= 0.13436424411240122\n",
- "決策樹個數: 1\n",
- "模型得分: [87.8048780487805, 90.2439024390244, 92.6829268292683, 85.36585365853658, 95.1219512195122]\n",
- "平均準確度: 90.244%\n",
- "隨機因子= 0.13436424411240122\n",
- "決策樹個數: 10\n",
- "模型得分: [92.6829268292683, 92.6829268292683, 87.8048780487805, 78.04878048780488, 100.0]\n",
- "平均準確度: 90.244%\n"
- ]
- }
- ],
- "source": [
- " \n",
- " #綜合測試函數\n",
- " n_folds = 5 # 分成5份數據,進行交叉驗證\n",
- " max_depth = 20 # 調參(自己修改) #決策樹深度不能太深,不然容易導致過擬合\n",
- " min_size = 1 # 決策樹的葉子節點最少的元素數量\n",
- " sample_size = 1.0 # 做決策樹時候的樣本的比例\n",
- " # n_features = int((len(dataset[0])-1))\n",
- " n_features = 15 # 調參(自己修改) #準確性與多樣性之間的權衡\n",
- " for n_trees in [1, 10]: # 理論上樹是越多越好\n",
- " scores = evaluteAlgorithm(dataSet, randomForest, n_folds, max_depth, min_size, sample_size, n_trees, n_features)\n",
- " # 每一次執行本文件時都能產生同一個隨機數\n",
- " random.seed(1)\n",
- " print('隨機因子=', random.random()) # 每一次執行本文件時都能產生同一個隨機數\n",
- " print('決策樹個數: %d' % n_trees) # 輸出決策樹個數\n",
- " print('模型得分: %s' % scores) # 輸出五份隨機樣本的模型得分\n",
- " print('平均準確度: %.3f%%' % (sum(scores)/float(len(scores)))) # 輸出五份隨機樣本的平均準確度\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "隨機因子= 0.13436424411240122\n",
- "決策樹個數: 1\n",
- "模型得分: [80.48780487804879, 75.60975609756098, 73.17073170731707, 75.60975609756098, 78.04878048780488]\n",
- "平均準確度: 76.585%\n",
- "隨機因子= 0.13436424411240122\n",
- "決策樹個數: 10\n",
- "模型得分: [87.8048780487805, 85.36585365853658, 90.2439024390244, 78.04878048780488, 92.6829268292683]\n",
- "平均準確度: 86.829%\n"
- ]
- }
- ],
- "source": [
- " sample_size =0.5 # 做決策樹時候的樣本的比例\n",
- " \n",
- " for n_trees in [1, 10]: # 理論上樹是越多越好\n",
- " scores = evaluteAlgorithm(dataSet, randomForest, n_folds, max_depth, min_size, sample_size, n_trees, n_features)\n",
- " # 每一次執行本文件時都能產生同一個隨機數\n",
- " random.seed(1)\n",
- " print('隨機因子=', random.random()) # 每一次執行本文件時都能產生同一個隨機數\n",
- " print('決策樹個數: %d' % n_trees) # 輸出決策樹個數\n",
- " print('模型得分: %s' % scores) # 輸出五份隨機樣本的模型得分\n",
- " print('平均準確度: %.3f%%' % (sum(scores)/float(len(scores)))) # 輸出五份隨機樣本的平均準確度"
- ]
- }
- ],
- 余下見附件
復制代碼
全部資料51hei下載地址:
隨機森林例子.zip
(99.15 KB, 下載次數: 11)
2020-12-18 11:04 上傳
點擊文件名下載附件
下載積分: 黑幣 -5
|