{"id":962,"date":"2022-10-16T20:33:26","date_gmt":"2022-10-16T12:33:26","guid":{"rendered":"http:\/\/viplao.com\/?p=962"},"modified":"2023-09-10T22:53:17","modified_gmt":"2023-09-10T14:53:17","slug":"%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5","status":"publish","type":"post","link":"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/","title":{"rendered":"\u8fd0\u7ef4\u5de5\u5177 &#8211; \u5927\u4f17\u70b9\u8bc4\u8bc4\u8bba\u6587\u672c\u6316\u6398\u57fa\u7840\u6848\u4f8b\u5b9e\u8df5"},"content":{"rendered":"\n<div id=\"ez-toc-container\" class=\"ez-toc-v2_0_71 counter-hierarchy ez-toc-counter ez-toc-grey ez-toc-container-direction\">\n<div class=\"ez-toc-title-container\">\n<p class=\"ez-toc-title\" style=\"cursor:inherit\">\u6587\u7ae0\u76ee\u5f55<\/p>\n<span class=\"ez-toc-title-toggle\"><a href=\"#\" class=\"ez-toc-pull-right ez-toc-btn ez-toc-btn-xs ez-toc-btn-default ez-toc-toggle\" aria-label=\"Toggle Table of Content\"><span class=\"ez-toc-js-icon-con\"><span class=\"\"><span class=\"eztoc-hide\" style=\"display:none;\">Toggle<\/span><span class=\"ez-toc-icon-toggle-span\"><svg style=\"fill: #999;color:#999\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" class=\"list-377408\" width=\"20px\" height=\"20px\" viewBox=\"0 0 24 24\" fill=\"none\"><path d=\"M6 6H4v2h2V6zm14 0H8v2h12V6zM4 11h2v2H4v-2zm16 0H8v2h12v-2zM4 16h2v2H4v-2zm16 0H8v2h12v-2z\" fill=\"currentColor\"><\/path><\/svg><svg style=\"fill: #999;color:#999\" class=\"arrow-unsorted-368013\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" width=\"10px\" height=\"10px\" viewBox=\"0 0 24 24\" version=\"1.2\" baseProfile=\"tiny\"><path d=\"M18.2 9.3l-6.2-6.3-6.2 
6.3c-.2.2-.3.4-.3.7s.1.5.3.7c.2.2.4.3.7.3h11c.3 0 .5-.1.7-.3.2-.2.3-.5.3-.7s-.1-.5-.3-.7zM5.8 14.7l6.2 6.3 6.2-6.3c.2-.2.3-.5.3-.7s-.1-.5-.3-.7c-.2-.2-.4-.3-.7-.3h-11c-.3 0-.5.1-.7.3-.2.2-.3.5-.3.7s.1.5.3.7z\"\/><\/svg><\/span><\/span><\/span><\/a><\/span><\/div>\n<nav><ul class='ez-toc-list ez-toc-list-level-1 eztoc-toggle-hide-by-default' ><ul class='ez-toc-list-level-2' ><li class='ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-1\" href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/#%E4%B8%80%E3%80%81%E7%88%AC%E8%99%AB\" title=\"\u4e00\u3001\u722c\u866b\">\u4e00\u3001\u722c\u866b<\/a><ul class='ez-toc-list-level-3' ><li class='ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-2\" href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/#%E6%95%B4%E4%BD%93%E6%80%9D%E8%B7%AF\" title=\"\u6574\u4f53\u601d\u8def\">\u6574\u4f53\u601d\u8def<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-3\" href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/#%E7%BD%91%E9%A1%B5%E7%88%AC%E5%8F%96%E5%92%8C%E8%A7%A3%E6%9E%90\" title=\"\u7f51\u9875\u722c\u53d6\u548c\u89e3\u6790\">\u7f51\u9875\u722c\u53d6\u548c\u89e3\u6790<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-4\" 
href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/#%E6%95%B0%E6%8D%AE%E5%AD%98%E5%82%A8\" title=\"\u6570\u636e\u5b58\u50a8\">\u6570\u636e\u5b58\u50a8<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-5\" href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/#%E5%8F%8D%E7%88%AC%E8%99%AB%E5%AF%B9%E6%8A%97\" title=\"\u53cd\u722c\u866b\u5bf9\u6297\">\u53cd\u722c\u866b\u5bf9\u6297<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-6\" href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/#%E4%BA%8C%E3%80%81%E6%8E%A2%E7%B4%A2%E6%80%A7%E5%88%86%E6%9E%90%E4%B8%8E%E6%96%87%E6%9C%AC%E6%95%B0%E6%8D%AE%E9%A2%84%E5%A4%84%E7%90%86\" title=\"\u4e8c\u3001\u63a2\u7d22\u6027\u5206\u6790\u4e0e\u6587\u672c\u6570\u636e\u9884\u5904\u7406\">\u4e8c\u3001\u63a2\u7d22\u6027\u5206\u6790\u4e0e\u6587\u672c\u6570\u636e\u9884\u5904\u7406<\/a><ul class='ez-toc-list-level-3' ><li class='ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-7\" href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/#%E6%8E%A2%E7%B4%A2%E6%80%A7%E5%88%86%E6%9E%90\" title=\"\u63a2\u7d22\u6027\u5206\u6790\">\u63a2\u7d22\u6027\u5206\u6790<\/a><\/li><li class='ez-toc-page-1 
ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-8\" href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/#%E6%95%B0%E6%8D%AE%E9%A2%84%E5%A4%84%E7%90%86\" title=\"\u6570\u636e\u9884\u5904\u7406\">\u6570\u636e\u9884\u5904\u7406<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-9\" href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/#%E8%AF%8D%E4%BA%91%E5%B1%95%E7%A4%BA\" title=\"\u8bcd\u4e91\u5c55\u793a\">\u8bcd\u4e91\u5c55\u793a<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-10\" href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/#%E4%B8%89%E3%80%81%E6%96%87%E6%9C%AC%E7%9A%84%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90\" title=\"\u4e09\u3001\u6587\u672c\u7684\u60c5\u611f\u5206\u6790\">\u4e09\u3001\u6587\u672c\u7684\u60c5\u611f\u5206\u6790<\/a><ul class='ez-toc-list-level-3' ><li class='ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-11\" href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/#%E6%96%87%E6%9C%AC%E7%89%B9%E5%BE%81%E6%8F%90%E5%8F%96%EF%BC%88TF-IDF%EF%BC%89\" title=\"\u6587\u672c\u7279\u5f81\u63d0\u53d6\uff08TF-IDF\uff09\">\u6587\u672c\u7279\u5f81\u63d0\u53d6\uff08TF-IDF\uff09<\/a><\/li><li 
class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-12\" href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/#%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E5%BB%BA%E6%A8%A1\" title=\"\u673a\u5668\u5b66\u4e60\u5efa\u6a21\">\u673a\u5668\u5b66\u4e60\u5efa\u6a21<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-13\" href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/#%E6%A0%B7%E6%9C%AC%E6%95%B0%E6%8D%AE%E4%B8%8D%E5%B9%B3%E8%A1%A1\" title=\"\u6837\u672c\u6570\u636e\u4e0d\u5e73\u8861\">\u6837\u672c\u6570\u636e\u4e0d\u5e73\u8861<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-14\" href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/#%E6%A8%A1%E5%9E%8B%E8%AF%84%E4%BC%B0%E6%B5%8B%E8%AF%95\" title=\"\u6a21\u578b\u8bc4\u4f30\u6d4b\u8bd5\">\u6a21\u578b\u8bc4\u4f30\u6d4b\u8bd5<\/a><\/li><\/ul><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-1'><a class=\"ez-toc-link ez-toc-heading-15\" href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/#1_%E6%9C%B4%E7%B4%A0%E8%B4%9D%E5%8F%B6%E6%96%AF%E7%9B%B8%E5%85%B3%E7%9A%84%E7%BB%9F%E8%AE%A1%E5%AD%A6%E7%9F%A5%E8%AF%86\" title=\"1. 
\u6734\u7d20\u8d1d\u53f6\u65af\u76f8\u5173\u7684\u7edf\u8ba1\u5b66\u77e5\u8bc6\">1. \u6734\u7d20\u8d1d\u53f6\u65af\u76f8\u5173\u7684\u7edf\u8ba1\u5b66\u77e5\u8bc6<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-1'><a class=\"ez-toc-link ez-toc-heading-16\" href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/#_2_%E6%9C%B4%E7%B4%A0%E8%B4%9D%E5%8F%B6%E6%96%AF%E7%9A%84%E6%A8%A1%E5%9E%8B\" title=\"&nbsp;2. \u6734\u7d20\u8d1d\u53f6\u65af\u7684\u6a21\u578b\">&nbsp;2. \u6734\u7d20\u8d1d\u53f6\u65af\u7684\u6a21\u578b<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-1'><a class=\"ez-toc-link ez-toc-heading-17\" href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/#3_%E6%9C%B4%E7%B4%A0%E8%B4%9D%E5%8F%B6%E6%96%AF%E7%9A%84%E6%8E%A8%E6%96%AD%E8%BF%87%E7%A8%8B\" title=\"3.&nbsp;\u6734\u7d20\u8d1d\u53f6\u65af\u7684\u63a8\u65ad\u8fc7\u7a0b\">3.&nbsp;\u6734\u7d20\u8d1d\u53f6\u65af\u7684\u63a8\u65ad\u8fc7\u7a0b<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-1'><a class=\"ez-toc-link ez-toc-heading-18\" href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/#4_%E6%9C%B4%E7%B4%A0%E8%B4%9D%E5%8F%B6%E6%96%AF%E7%9A%84%E5%8F%82%E6%95%B0%E4%BC%B0%E8%AE%A1\" title=\"4.&nbsp;\u6734\u7d20\u8d1d\u53f6\u65af\u7684\u53c2\u6570\u4f30\u8ba1\">4.&nbsp;\u6734\u7d20\u8d1d\u53f6\u65af\u7684\u53c2\u6570\u4f30\u8ba1<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-1'><a class=\"ez-toc-link ez-toc-heading-19\" 
href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/#5_%E6%9C%B4%E7%B4%A0%E8%B4%9D%E5%8F%B6%E6%96%AF%E7%AE%97%E6%B3%95%E8%BF%87%E7%A8%8B\" title=\"5. &nbsp;\u6734\u7d20\u8d1d\u53f6\u65af\u7b97\u6cd5\u8fc7\u7a0b\">5. &nbsp;\u6734\u7d20\u8d1d\u53f6\u65af\u7b97\u6cd5\u8fc7\u7a0b<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-1'><a class=\"ez-toc-link ez-toc-heading-20\" href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/#6_%E6%9C%B4%E7%B4%A0%E8%B4%9D%E5%8F%B6%E6%96%AF%E7%AE%97%E6%B3%95%E5%B0%8F%E7%BB%93\" title=\"6.&nbsp;&nbsp;\u6734\u7d20\u8d1d\u53f6\u65af\u7b97\u6cd5\u5c0f\u7ed3\">6.&nbsp;&nbsp;\u6734\u7d20\u8d1d\u53f6\u65af\u7b97\u6cd5\u5c0f\u7ed3<\/a><\/li><\/ul><\/nav><\/div>\n<h2 class=\"wp-block-heading\"><span class=\"ez-toc-section\" id=\"%E4%B8%80%E3%80%81%E7%88%AC%E8%99%AB\"><\/span>\u4e00\u3001\u722c\u866b<span class=\"ez-toc-section-end\"><\/span><\/h2>\n\n\n\n<h3 class=\"wp-block-heading\"><span class=\"ez-toc-section\" id=\"%E6%95%B4%E4%BD%93%E6%80%9D%E8%B7%AF\"><\/span><a href=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining#%E6%95%B4%E4%BD%93%E6%80%9D%E8%B7%AF\"><\/a>\u6574\u4f53\u601d\u8def<span 
class=\"ez-toc-section-end\"><\/span><\/h3>\n\n\n\n<p>\u722c\u53d6\u5927\u4f17\u70b9\u8bc4\u5341\u5927\u70ed\u95e8\u7cd6\u6c34\u5e97\u7684\u8bc4\u8bba\uff0c\u722c\u53d6\u7f51\u9875\u540e\u4ecehtml\u9875\u9762\u4e2d\u628a\u9700\u8981\u7684\u5b57\u6bb5\u4fe1\u606f\uff08\u987e\u5ba2id\u3001\u8bc4\u8bba\u65f6\u95f4\u3001\u8bc4\u5206\u3001\u8bc4\u8bba\u5185\u5bb9\u3001\u53e3\u5473\u3001\u73af\u5883\u3001\u670d\u52a1\u3001\u5e97\u94faID\uff09\u63d0\u53d6\u51fa\u6765\u5e76\u5b58\u50a8\u5230MYSQL\u6570\u636e\u5e93\u4e2d\u3002 <\/p>\n\n\n\n<p><a href=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining\">https:\/\/github.com\/wangkaikai07\/dianping_textmining<\/a> \uff08\u5b9e\u8df5\u53ef\u6b63\u5e38\u8fd0\u884c\u4e0e\u5b66\u4e60\uff09<\/p>\n\n\n\n<p><a href=\"https:\/\/github.com\/datawhalechina\/fun-rec\">https:\/\/github.com\/datawhalechina\/fun-rec<\/a><\/p>\n\n\n\n<h3 class=\"wp-block-heading\"><span class=\"ez-toc-section\" id=\"%E7%BD%91%E9%A1%B5%E7%88%AC%E5%8F%96%E5%92%8C%E8%A7%A3%E6%9E%90\"><\/span><a href=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining#%E7%BD%91%E9%A1%B5%E7%88%AC%E5%8F%96%E5%92%8C%E8%A7%A3%E6%9E%90\"><\/a>\u7f51\u9875\u722c\u53d6\u548c\u89e3\u6790<span class=\"ez-toc-section-end\"><\/span><\/h3>\n\n\n\n<p>\u94fe\u63a5\u683c\u5f0f\u4e3a&#8221;<a href=\"http:\/\/www.dianping.com\/shop\/\">http:\/\/www.dianping.com\/shop\/<\/a>&#8221; + shopID + &#8220;\/review_all\/&#8221; + pi\uff0c\u5982\uff1a<a 
href=\"http:\/\/www.dianping.com\/shop\/518986\/review_all\/p1\">http:\/\/www.dianping.com\/shop\/518986\/review_all\/p1<\/a>&nbsp;\uff0c\u4e00\u9875\u8bc4\u8bba\u670920\u6761\u3002\u6211\u4eec\u4f7f\u7528for\u5faa\u73af\u6784\u9020\u94fe\u63a5URL\uff0c\u4f7f\u7528requests\u5e93\u53d1\u8d77\u8bf7\u6c42\u5e76\u628ahtml\u9875\u9762\u722c\u53d6\u4e0b\u6765\uff0c\u901a\u8fc7BeautifulSoup\u548cre\u5e93\u89e3\u6790\u9875\u9762\u63d0\u53d6\u4fe1\u606f\u3002<\/p>\n\n\n\n<p>\u6211\u4eec\u53d1\u73b0\u5b8c\u6574\u7684\u8bc4\u8bba\u90fd\u5b58\u50a8\u5728&#8217;div&#8217;,&#8217;main-review&#8217;\u4e2d\uff0c\u4e14\u90e8\u5206\u9875\u9762\u53e3\u5473\u3001\u73af\u5883\u3001\u670d\u52a1\u5e76\u4e0d\u662f\u6bcf\u4e00\u9875\u90fd\u6709\uff0c\u56e0\u6b64\u9700\u8981\u4f7f\u7528try&#8230;except&#8230;\u9632\u6b62\u7a0b\u5e8f\u4e2d\u65ad\uff0cBeautifulSoup\u90e8\u5206\u4ee3\u7801\u5982\u4e0b\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\">for item in soup('div','main-review'):\n    cus_id = item.find('a','name').text.strip()\n    comment_time = item.find('span','time').text.strip()\n    comment_star = item.find('span',re.compile('sml-rank-stars')).get('class')[1]\n    cus_comment = item.find('div',\"review-words\").text.strip()\n    scores = str(item.find('span','score'))\n    try:\n        kouwei = re.findall(r'\u53e3\u5473\uff1a([\\u4e00-\\u9fa5]*)',scores)[0]\n        huanjing = re.findall(r'\u73af\u5883\uff1a([\\u4e00-\\u9fa5]*)',scores)[0]\n        fuwu = re.findall(r'\u670d\u52a1\uff1a([\\u4e00-\\u9fa5]*)',scores)[0]\n        except:\n            kouwei = huanjing = fuwu = '\u65e0'<\/pre>\n\n\n\n<h3 class=\"wp-block-heading\"><span class=\"ez-toc-section\" id=\"%E6%95%B0%E6%8D%AE%E5%AD%98%E5%82%A8\"><\/span><a href=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining#%E6%95%B0%E6%8D%AE%E5%AD%98%E5%82%A8\"><\/a>\u6570\u636e\u5b58\u50a8<span 
class=\"ez-toc-section-end\"><\/span><\/h3>\n\n\n\n<p>\u6211\u4eec\u4f7f\u7528MYSQL\u6570\u636e\u5e93\uff0c\u5b89\u88c5\u6559\u7a0b\u53c2\u8003<a href=\"http:\/\/www.runoob.com\/mysql\/mysql-install.html\">\u83dc\u9e1f\u6559\u7a0b<\/a>\uff0cpython\u8fde\u63a5MYSQL\u6570\u636e\u63a8\u8350\u4f7f\u7528pymysql\uff0c\u540c\u6837\u662f\u63a8\u8350\u83dc\u9e1f\u6559\u7a0b<a href=\"http:\/\/www.runoob.com\/python3\/python3-mysql.html\">\u83dc\u9e1f\u6559\u7a0b<\/a>\u3002\u6211\u4eec\u9700\u8981\u5148\u5efa\u7acb\u4e00\u4e2a\u6570\u636e\u5e93\u548c\u8868\uff0c\u7136\u540e\u8fde\u63a5\u5e76\u5b9a\u4e49\u6e38\u6807\uff0c\u7136\u540e\u5199\u5bf9\u5e94\u7684sql\u8bed\u53e5\uff0c\u6700\u540e\u6267\u884c\u4e8b\u52a1\uff0c\u5b58\u50a8\u90e8\u5206\u7684\u4ee3\u7801\u5982\u4e0b\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\">#\u8fde\u63a5MYSQL\u6570\u636e\u5e93\ndb = pymysql.connect(\"localhost\",\"root\",\"\",\"TESTDB\" )\ncursor = db.cursor()\n#\u5b58\u50a8\u722c\u53d6\u5230\u7684\u6570\u636e\ndef save_data(data_dict):\n    sql = '''INSERT INTO DZDP(cus_id, comment_time, comment_star, cus_comment, kouwei, huanjing,           fuwu, shopID) VALUES(%s,%s,%s,%s,%s,%s,%s,%s)'''\n    value_tup = (data_dict['cus_id']\n                 ,data_dict['comment_time']\n                 ,data_dict['comment_star']\n                 ,data_dict['cus_comment']\n                 ,data_dict['kouwei']\n                 ,data_dict['huanjing']\n                 ,data_dict['fuwu']\n                 ,data_dict['shopID']\n                 )\n    try:\n        cursor.execute(sql,value_tup)\n        db.commit()\n    except:\n        print('\u6570\u636e\u5e93\u5199\u5165\u5931\u8d25')\n    return<\/pre>\n\n\n\n<h3 class=\"wp-block-heading\"><span class=\"ez-toc-section\" id=\"%E5%8F%8D%E7%88%AC%E8%99%AB%E5%AF%B9%E6%8A%97\"><\/span><a href=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining#%E5%8F%8D%E7%88%AC%E8%99%AB%E5%AF%B9%E6%8A%97\"><\/a>\u53cd\u722c\u866b\u5bf9\u6297<span 
class=\"ez-toc-section-end\"><\/span><\/h3>\n\n\n\n<ol>\n<li><strong>\u4fee\u6539\u8bf7\u6c42\u5934\u4e2d\u6d4f\u89c8\u5668\u4fe1\u606f<\/strong>\uff1a\u4f7f\u7528fake_useragent\u7b2c\u4e09\u65b9\u5e93\uff0c\u4fee\u6539request\u4e2d\u7684headers\u53c2\u6570\uff0c\u7528\u6cd5\u5982\u4e0b\uff1afrom fake_useragent import UserAgent ua = UserAgent() headers = {&#8216;User-Agent&#8217;:ua.random}<\/li>\n\n\n\n<li><strong>\u8bbe\u7f6e\u8df3\u8f6c\u8def\u5f84<\/strong>\uff1a\u5728\u8bbf\u95ee\u8bc4\u8bba\u65f6\uff0c\u4e00\u822c\u7684\u6d4f\u89c8\u884c\u4e3a\u662f\u4ece\u67d0\u4e00\u9875\u8df3\u8f6c\u5230\u4e0b\u4e00\u9875\u8fd9\u6837\u7684\uff0c\u800c\u4e0d\u662f\u76f4\u63a5\u901a\u8fc7\u8fde\u63a5\u8bbf\u95ee\uff0c\u4e3a\u4e86\u66f4\u597d\u7684\u4f2a\u88c5\u6210\u4e00\u4e2a\u6b63\u5e38\u7684\u8bbf\u95ee\uff0c\u6211\u4eec\u9700\u8981\u8bbe\u7f6e\u4e00\u4e0b\u8df3\u8f6c\u7684\u8def\u5f84\uff0c\u4fee\u6539headers\u4e2d\u7684Referer\u53c2\u6570headers = { &#8216;User-Agent&#8217;:ua.random, &#8216;Cookie&#8217;:cookie, &#8216;Referer&#8217;: &#8216;http:\/\/www.dianping.com\/shop\/518986\/review_all&#8217; 
}<\/li>\n\n\n\n<li><strong>\u8bbe\u7f6eCookies<\/strong>\uff1a\u8bc4\u8bba\u6570\u636e\u9700\u8981\u767b\u5f55\u540e\u624d\u80fd\u83b7\u53d6\uff0c\u4e0b\u9762\u4ecb\u7ecd\u4e00\u79cd\u975e\u5e38\u7b80\u5355\u65b9\u4fbf\u7684\u7ed5\u8fc7\u767b\u5f55\u7684\u65b9\u6cd5\u3002\n<ul>\n<li>\u5728\u7f51\u9875\u4e0a\u8fdb\u884c\u767b\u5f55<\/li>\n\n\n\n<li>\u4f7f\u7528Chrome\u6d4f\u89c8\u5668\u7684\u5f00\u53d1\u8005\u5de5\u5177\uff0c\u67e5\u8be2\u5f53\u524d\u8bf7\u6c42\u7684cookie<\/li>\n\n\n\n<li>\u590d\u5236\u6d4f\u89c8\u5668\u4e2d\u7684cookie\uff0c\u4f7f\u7528\u6b64cookie\u5bf9\u6211\u4eec\u7684\u8bf7\u6c42\u8fdb\u884c\u4f2a\u88c5<\/li>\n<\/ul>\n<\/li>\n\n\n\n<li><strong>\u4f7f\u7528IP\u4ee3\u7406\u6c60<\/strong>\uff1a\u8fd9\u91cc\u4f7f\u7528\u897f\u523a\u4ee3\u7406\u7684\u514d\u8d39\u4ee3\u7406\uff0c\u6784\u5efa\u4e00\u4e2a\u722c\u866b\u722c\u53d6\u897f\u523a\u4ee3\u7406\u7684ip\uff0c\u7136\u540e\u8fdb\u884c\u9a8c\u8bc1\uff0c\u7b5b\u6389\u4e0d\u53ef\u7528\u7684ip\uff0c\u6784\u5efa\u51faip\u6c60\u4f9b\u540e\u7eed\u8c03\u7528\uff0c\u4ee3\u7801\u6765\u81ea\u7f51\u7edc\u3002\u4f46\u662f\u7ecf\u8fc7\u6d4b\u8bd5\uff0c\u5927\u4f17\u70b9\u8bc4\u5bf9\u4e00\u4e2a\u8d26\u53f7\u4e0d\u540cip\u8bbf\u95ee\u76d1\u63a7\u975e\u5e38\u4e25\u683c\uff0c\u4f7f\u7528IP\u4ee3\u7406\u6c60\u4e0d\u66f4\u6362\u8d26\u53f7\u7684\u8bdd\uff0c\u6b7b\u7684\u66f4\u5feb\uff0c\u5c01\u4f60\u8d26\u53f7\uff0c\u7136\u800c\u6784\u5efa\u8d26\u53f7\u6c60\u6bd4\u8f83\u9ebb\u70e6\uff0c\u6211\u4eec\u5148\u6682\u7f13\u3002<\/li>\n\n\n\n<li><strong>\u964d\u4f4e\u722c\u53d6\u9891\u7387<\/strong>\uff1a\u4e00\u4e2a\u7b80\u5355\u53c8\u6709\u6548\u7684\u65b9\u6cd5\u5c31\u662f\u964d\u4f4e\u722c\u53d6\u9891\u7387\uff0c\u6bd5\u7adf\u9ad8\u9891\u7387\u7684\u722c\u53d6\u5bf9\u670d\u52a1\u5668\u4e5f\u662f\u4e00\u4e2a\u8003\u9a8c\uff0c\u5982\u679c\u5bf9\u901f\u5ea6\u7684\u8981\u6c42\u4e0d\u662f\u5f88\u9ad8\u7684\u8bdd\uff0c\u5efa\u8bae\u628a\u9891\u7387\u653e\u6162\u4e00\u70b9\uff0c\u4f60\u597d\u6211\u597d\u5927\u5bb6\u597d\uff01im
port random import time time.sleep(6*random.random() + 4)<\/li>\n\n\n\n<li><strong>\u8bbe\u7f6e\u65ad\u70b9\u7eed\u4f20<\/strong>\uff1a\u5373\u4f7f\u964d\u4f4e\u4e86\u722c\u53d6\u9891\u7387\uff0c\u6709\u65f6\u8fd8\u662f\u4f1a\u88ab\u7f8e\u56e2\u7684\u7f51\u7edc\u5de5\u7a0b\u5e08\u6293\u5230\u7684\uff0c\u5c0f\u54e5\u54e5\u9976\u547d\u554a~\u3002\u56e0\u6b64\u6211\u4eec\u9700\u8981\u4e00\u4e2a\u65ad\u70b9\u7eed\u4f20\u7684\u5c0f\u529f\u80fd\uff0c\u907f\u514d\u6bcf\u6b21\u90fd\u4ece\u5934\u5f00\u59cb\u722c\u3002\u601d\u8def\u662f\u5efa\u4e00\u4e2a\u6587\u672c\u6587\u4ef6\uff0c\u5b58\u50a8\u5f53\u524d\u722c\u53d6\u7684\u8fdb\u5ea6\uff0c\u6bcf\u6b21\u8fd0\u884c\u7a0b\u5e8f\u65f6\u90fd\u4ece\u5f53\u524d\u8fdb\u5ea6\u5f00\u59cb\uff0c\u8be6\u89c1\u4ee3\u7801~<\/li>\n<\/ol>\n\n\n\n<h2 class=\"wp-block-heading\"><span class=\"ez-toc-section\" id=\"%E4%BA%8C%E3%80%81%E6%8E%A2%E7%B4%A2%E6%80%A7%E5%88%86%E6%9E%90%E4%B8%8E%E6%96%87%E6%9C%AC%E6%95%B0%E6%8D%AE%E9%A2%84%E5%A4%84%E7%90%86\"><\/span><a href=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining#%E4%BA%8C%E6%8E%A2%E7%B4%A2%E6%80%A7%E5%88%86%E6%9E%90%E4%B8%8E%E6%96%87%E6%9C%AC%E6%95%B0%E6%8D%AE%E9%A2%84%E5%A4%84%E7%90%86\"><\/a>\u4e8c\u3001\u63a2\u7d22\u6027\u5206\u6790\u4e0e\u6587\u672c\u6570\u636e\u9884\u5904\u7406<span class=\"ez-toc-section-end\"><\/span><\/h2>\n\n\n\n<h3 class=\"wp-block-heading\"><span class=\"ez-toc-section\" id=\"%E6%8E%A2%E7%B4%A2%E6%80%A7%E5%88%86%E6%9E%90\"><\/span><a href=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining#%E6%8E%A2%E7%B4%A2%E6%80%A7%E5%88%86%E6%9E%90\"><\/a>\u63a2\u7d22\u6027\u5206\u6790<span class=\"ez-toc-section-end\"><\/span><\/h3>\n\n\n\n<ol>\n<li>\u67e5\u770b\u6570\u636e\u5927\u5c0f\u4ee5\u53ca\u57fa\u7840\u4fe1\u606f \uff0c\u6d4f\u89c8\u6570\u636e<a target=\"_blank\" rel=\"noreferrer noopener\" 
href=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining\/blob\/master\/%E6%96%87%E6%9C%AC%E5%88%86%E6%9E%90%E6%8C%96%E6%8E%98\/source\/data_head.png\"><\/a><\/li>\n\n\n\n<li>\u6837\u672c\u5206\u5e03<a target=\"_blank\" rel=\"noreferrer noopener\" href=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining\/blob\/master\/%E6%96%87%E6%9C%AC%E5%88%86%E6%9E%90%E6%8C%96%E6%8E%98\/source\/stars.png\"><\/a><\/li>\n\n\n\n<li>\u5404\u5e97\u94fa\u8bc4\u5206\u5206\u5e03<a target=\"_blank\" rel=\"noreferrer noopener\" href=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining\/blob\/master\/%E6%96%87%E6%9C%AC%E5%88%86%E6%9E%90%E6%8C%96%E6%8E%98\/source\/dianpu.png\"><\/a><\/li>\n\n\n\n<li>\u70b9\u8bc4\u6570\u7684\u65f6\u95f4\u5206\u5e03<a target=\"_blank\" rel=\"noreferrer noopener\" href=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining\/blob\/master\/%E6%96%87%E6%9C%AC%E5%88%86%E6%9E%90%E6%8C%96%E6%8E%98\/source\/time.png\"><\/a><\/li>\n\n\n\n<li>\u67e5\u770b\u8bc4\u8bba\u957f\u5ea6\u5bf9\u7ed3\u679c\u5f71\u54cd<a target=\"_blank\" rel=\"noreferrer noopener\" href=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining\/blob\/master\/%E6%96%87%E6%9C%AC%E5%88%86%E6%9E%90%E6%8C%96%E6%8E%98\/source\/len.png\"><\/a><\/li>\n<\/ol>\n\n\n\n<h3 class=\"wp-block-heading\"><span class=\"ez-toc-section\" id=\"%E6%95%B0%E6%8D%AE%E9%A2%84%E5%A4%84%E7%90%86\"><\/span><a href=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining#%E6%95%B0%E6%8D%AE%E9%A2%84%E5%A4%84%E7%90%86\"><\/a>\u6570\u636e\u9884\u5904\u7406<span 
class=\"ez-toc-section-end\"><\/span><\/h3>\n\n\n\n<ol>\n<li><strong>\u53bb\u9664\u975e\u6587\u672c\u6570\u636e<\/strong>\uff1a\u53ef\u4ee5\u770b\u51fa\uff0c\u722c\u866b\u83b7\u53d6\u7684\u6570\u636e\u975e\u5e38\u591a\u7c7b\u4f3c\u201c\\xa0\u201d\u7684\u975e\u6587\u672c\u6570\u636e\uff0c\u800c\u4e14\u90fd\u8fd8\u6709\u4e00\u4e9b\u65e0\u610f\u4e49\u7684\u5e72\u6270\u6570\u636e\uff0c\u5982\u7ed3\u5c3e\u7684\u201c\u6536\u8d77\u8bc4\u8bba\u201d#\u9664\u53bb\u975e\u6587\u672c\u6570\u636e\u548c\u65e0\u610f\u4e49\u6587\u672c data[&#8216;cus_comment&#8217;] = data[&#8216;cus_comment&#8217;].str.replace(r'[^\\u4e00-\\u9fa5]&#8217;,&#8221;).str.replace(&#8216;\u6536\u8d77\u8bc4\u8bba&#8217;,&#8221;)<\/li>\n\n\n\n<li><strong>\u4e2d\u6587\u5206\u8bcd<\/strong>\uff1a\u4e2d\u6587\u6587\u672c\u6570\u636e\u5904\u7406\uff0c\u600e\u4e48\u80fd\u79bb\u5f00\u4e2d\u6587\u5206\u8bcd\u5462\uff0c\u6211\u4eec\u4f7f\u7528jieba\u5e93\uff0c\u7b80\u5355\u53c8\u597d\u7528\u3002\u8fd9\u91cc\u6211\u4eec\u628a\u6587\u672c\u5b57\u7b26\u4e32\u5904\u7406\u4e3a\u4ee5\u7a7a\u683c\u533a\u9694\u7684\u5206\u8bcd\u5b57\u7b26\u4e32#\u4e2d\u6587\u5206\u8bcd import jieba data[&#8216;cus_comment&#8217;] = data[&#8216;cus_comment&#8217;].apply(lambda x:&#8217; &#8216;.join(jieba.cut(x)))<\/li>\n\n\n\n<li><strong>\u53bb\u9664\u505c\u7528\u8bcd<\/strong>\uff1a\u6587\u672c\u4e2d\u6709\u5f88\u591a\u65e0\u6548\u7684\u8bcd\uff0c\u6bd4\u5982\u201c\u7740\u201d\uff0c\u201c\u548c\u201d\uff0c\u8fd8\u6709\u4e00\u4e9b\u6807\u70b9\u7b26\u53f7\uff0c\u8fd9\u4e9b\u6211\u4eec\u4e0d\u60f3\u5728\u6587\u672c\u5206\u6790\u7684\u65f6\u5019\u5f15\u5165\uff0c\u56e0\u6b64\u9700\u8981\u53bb\u6389\uff0c\u56e0\u4e3awordcloud\u548cTF-IDF\u90fd\u652f\u6301\u505c\u7528\u8bcd\uff0c\u56e0\u6b64\u5c31\u4e0d\u989d\u5916\u5904\u7406\u4e86<\/li>\n<\/ol>\n\n\n\n<h3 class=\"wp-block-heading\"><span class=\"ez-toc-section\" id=\"%E8%AF%8D%E4%BA%91%E5%B1%95%E7%A4%BA\"><\/span><a 
href=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining#%E8%AF%8D%E4%BA%91%E5%B1%95%E7%A4%BA\"><\/a>\u8bcd\u4e91\u5c55\u793a<span class=\"ez-toc-section-end\"><\/span><\/h3>\n\n\n\n<figure class=\"wp-block-image\"><a href=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining\/blob\/master\/%E6%96%87%E6%9C%AC%E5%88%86%E6%9E%90%E6%8C%96%E6%8E%98\/source\/wordcloud.png\" target=\"_blank\" rel=\"noreferrer noopener\"><img decoding=\"async\" src=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining\/raw\/master\/%E6%96%87%E6%9C%AC%E5%88%86%E6%9E%90%E6%8C%96%E6%8E%98\/source\/wordcloud.png\" alt=\"wordcloud\"\/><\/a><\/figure>\n\n\n\n<h2 class=\"wp-block-heading\"><span class=\"ez-toc-section\" id=\"%E4%B8%89%E3%80%81%E6%96%87%E6%9C%AC%E7%9A%84%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90\"><\/span><a href=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining#%E4%B8%89%E6%96%87%E6%9C%AC%E7%9A%84%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90\"><\/a>\u4e09\u3001\u6587\u672c\u7684\u60c5\u611f\u5206\u6790<span class=\"ez-toc-section-end\"><\/span><\/h2>\n\n\n\n<p>\u5148\u4e0a\u7ed3\u679c\uff1a<\/p>\n\n\n\n<figure 
class=\"wp-block-table\"><table><thead><tr><th>\u7cd6\u6c34\u5e97\u7684\u8bc4\u8bba\u6587\u672c<\/th><th>\u6a21\u578b\u9884\u6d4b\u7684\u60c5\u611f\u8bc4\u5206<\/th><\/tr><\/thead><tbody><tr><td>&#8216;\u7cd6\u6c34\u5473\u9053\u4e0d\u9519\uff0c\u6ed1\u800c\u4e0d\u817b\uff0c\u8d5e\u4e00\u4e2a\uff0c\u4e0b\u6b21\u8fd8\u4f1a\u6765&#8217;<\/td><td>0.91<\/td><\/tr><tr><td>&#8216;\u5473\u9053\u4e00\u822c\uff0c\u6ca1\u5565\u7279\u70b9&#8217;<\/td><td>0.52<\/td><\/tr><tr><td>&#8216;\u6392\u961f\u8001\u534a\u5929\uff0c\u73af\u5883\u5f88\u5dee\uff0c\u5473\u9053\u4e00\u822c\u822c&#8217;<\/td><td>0.05<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<p>\u6a21\u578b\u7684\u6548\u679c\u8fd8\u53ef\u4ee5\u7684\u6837\u5b50\uff0cyeah~\u63a5\u4e0b\u6765\u6211\u4eec\u597d\u597d\u8bb2\u8bb2\u600e\u4e48\u505a\u7684\u54c8\uff0c\u6211\u4eec\u901a\u8fc7\u722c\u866b\u722c\u53d6\u4e86\u5927\u4f17\u70b9\u8bc4\u5e7f\u5dde8\u5bb6\u6700\u70ed\u95e8\u7cd6\u6c34\u5e97\u76843W\u6761\u8bc4\u8bba\u4fe1\u606f\u4ee5\u53ca\u8bc4\u5206\u4f5c\u4e3a\u8bad\u7ec3\u6570\u636e\uff0c\u524d\u9762\u7684\u5206\u6790\u6211\u4eec\u5f97\u77e5<em>\u6837\u672c\u5f88\u4e0d\u5747\u8861<\/em>\u3002\u63a5\u4e0b\u6765\u6211\u4eec\u7684\u6574\u4f53\u601d\u8def\u5c31\u662f\uff1a\u6587\u672c\u7279\u5f81\u63d0\u53d6(TF-IDF)\u2014\u673a\u5668\u5b66\u4e60\u5efa\u6a21\u2014\u6a21\u578b\u8bc4\u4ef7\u3002<\/p>\n\n\n\n<p>\u6211\u4eec\u5148\u4e0d\u5904\u7406\u6837\u672c\u4e0d\u5747\u8861\u95ee\u9898\uff0c\u76f4\u63a5\u5efa\u6a21\u540e\u67e5\u770b\u7ed3\u679c\uff0c\u63a5\u4e0b\u6765\u6211\u4eec\u518d\u6309\u7167\u4e24\u79cd\u65b9\u6cd5\u5904\u7406\u6837\u672c\u4e0d\u5747\u8861\uff0c\u5bf9\u6bd4\u7ed3\u679c\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\"><span class=\"ez-toc-section\" id=\"%E6%96%87%E6%9C%AC%E7%89%B9%E5%BE%81%E6%8F%90%E5%8F%96%EF%BC%88TF-IDF%EF%BC%89\"><\/span><a 
href=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining#%E6%96%87%E6%9C%AC%E7%89%B9%E5%BE%81%E6%8F%90%E5%8F%96tf-idf\"><\/a>\u6587\u672c\u7279\u5f81\u63d0\u53d6\uff08TF-IDF\uff09<span class=\"ez-toc-section-end\"><\/span><\/h3>\n\n\n\n<p>\u6a21\u578b\u4e0d\u80fd\u76f4\u63a5\u5904\u7406\u6587\u672c\u6570\u636e\uff0c\u56e0\u6b64\u9700\u8981\u5148\u628a\u6587\u672c\u6570\u636e\u8f6c\u4e3a\u5411\u91cf\uff0c\u65b9\u6cd5\u6709\u8bcd\u5e93\u8868\u793a\u6cd5\u3001TF-IDF\u3001word2vec\u7b49\uff0c\u63a8\u8350\u4e00\u7bc7\u6587\u7ae0\uff0c\u603b\u7ed3\u5f97\u4e0d\u9519&nbsp;<a href=\"https:\/\/zhuanlan.zhihu.com\/p\/44917421\">https:\/\/zhuanlan.zhihu.com\/p\/44917421<\/a>\u3002<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\">#\u4f7f\u7528TF-IDF\u8fdb\u884c\u6587\u672c\u8f6c\u5411\u91cf\u5904\u7406\nfrom sklearn.feature_extraction.text import TfidfVectorizer\ntv = TfidfVectorizer(stop_words=stopwords, max_features=3000, ngram_range=(1,2))\ntv.fit(x_train)<\/pre>\n\n\n\n<h3 class=\"wp-block-heading\"><span class=\"ez-toc-section\" id=\"%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E5%BB%BA%E6%A8%A1\"><\/span><a href=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining#%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E5%BB%BA%E6%A8%A1\"><\/a>\u673a\u5668\u5b66\u4e60\u5efa\u6a21<span class=\"ez-toc-section-end\"><\/span><\/h3>\n\n\n\n<p>\u8fd9\u91cc\u6211\u4eec\u4f7f\u7528\u6587\u672c\u5206\u7c7b\u7684\u7ecf\u5178\u7b97\u6cd5\u6734\u7d20\u8d1d\u53f6\u65af\u7b97\u6cd5\uff0c\u800c\u4e14\u6734\u7d20\u8d1d\u53f6\u65af\u7b97\u6cd5\u7684\u8ba1\u7b97\u91cf\u8f83\u5c11\u3002\u7279\u5f81\u503c\u662f\u8bc4\u8bba\u6587\u672c\u7ecf\u8fc7TF-IDF\u5904\u7406\u7684\u5411\u91cf\uff0c\u6807\u7b7e\u503c\u8bc4\u8bba\u7684\u5206\u7c7b\u5171\u4e24\u7c7b\uff0c\u597d\u8bc4\u662f1\uff0c\u5dee\u8bc4\u662f0\u3002\u60c5\u611f\u8bc4\u5206\u4e3a\u5206\u7c7b\u5668\u9884\u6d4b\u5206\u7c7b1\u7684\u6982\u7387\u503c\u3002<\/p>\n\n\n\n<pre 
class=\"wp-block-preformatted\">#\u8ba1\u7b97\u5206\u7c7b\u6548\u679c\u7684\u51c6\u786e\u7387\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.metrics import roc_auc_score, f1_score\nclassifier = MultinomialNB()\nclassifier.fit(tv.transform(x_train), y_train)\nclassifier.score(tv.transform(x_test), y_test)\n\n&gt;&gt;&gt;0.9275308869629356<\/pre>\n\n\n\n<p>\u53ef\u4ee5\u770b\u51fa\uff0c\u51c6\u786e\u7387\u975e\u5e38\u4e0d\u9519\u7684\u6837\u5b50<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\">#\u4ece\u5927\u4f17\u70b9\u8bc4\u7f51\u627e\u4e24\u6761\u8bc4\u8bba\u6765\u6d4b\u8bd5\u4e00\u4e0b\ntest1 = '\u5f88\u597d\u5403\uff0c\u73af\u5883\u597d\uff0c\u6240\u6709\u5458\u5de5\u7684\u6001\u5ea6\u90fd\u5f88\u597d\uff0c\u4e0a\u83dc\u5feb\uff0c\u670d\u52a1\u4e5f\u5f88\u597d\uff0c\u5473\u9053\u597d\u5403\uff0c\u90fd\u662f\u7528\u84b8\u998f\u6c34\u716e\u7684\uff0c\u63a8\u8350\uff0c\u8d85\u597d\u5403' #5\u661f\u597d\u8bc4\ntest2 = '\u7cef\u7c73\u5916\u76ae\u4e0d\u7ef5\u6ed1\uff0c\u8c46\u6c99\u9985\u7c97\u8e81\uff0c\u6ca1\u6709\u9999\u751c\u5473\u300212\u5143\u4e00\u7897\u4e0d\u503c\u3002' #1\u661f\u5dee\u8bc4\nprint('\u597d\u8bc4\u5b9e\u4f8b\u7684\u6a21\u578b\u9884\u6d4b\u60c5\u611f\u5f97\u5206\u4e3a{}\\n\u5dee\u8bc4\u5b9e\u4f8b\u7684\u6a21\u578b\u9884\u6d4b\u60c5\u611f\u5f97\u5206\u4e3a{}'.format(ceshi(classifier,test1),ceshi(classifier,test2)))\n\n&gt;&gt;&gt;\u597d\u8bc4\u5b9e\u4f8b\u7684\u6a21\u578b\u9884\u6d4b\u60c5\u611f\u5f97\u5206\u4e3a0.8638082706675478\n&gt;&gt;&gt;\u5dee\u8bc4\u5b9e\u4f8b\u7684\u6a21\u578b\u9884\u6d4b\u60c5\u611f\u5f97\u5206\u4e3a0.7856544482460911<\/pre>\n\n\n\n<p>\u70b9\u8bc4\u7f51\u4e0a\u7684\u5b9e\u9645\u6d4b\u8bd5\u4e2d\uff0c5\u661f\u597d\u8bc4\u6a21\u578b\u9884\u6d4b\u51fa\u6765\u4e86\uff0c1\u661f\u5dee\u8bc4\u5374\u9884\u6d4b\u9519\u8bef\u3002\u4e3a\u4ec0\u4e48\u5462\uff1f\u6211\u4eec\u67e5\u770b\u4e00\u4e0b<strong>\u6df7\u6dc6\u77e9\u9635<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>&#91;  46,  385]\n&#91;   8, 
4984]\n<\/code><\/pre>\n\n\n\n<p>\u53ef\u4ee5\u770b\u51fa\uff0c<strong>\u8d1f\u7c7b\u7684\u9884\u6d4b\u975e\u5e38\u4e0d\u51c6<\/strong>\uff0c431\u6761\u8d1f\u7c7b\u6837\u672c\u4e2d\u51c6\u786e\u9884\u6d4b\u4e3a\u8d1f\u7c7b\u7684\u53ea\u670946\u6761\uff08\u7ea610.7%\uff09\uff0c\u5e94\u8be5\u662f\u7531\u4e8e<strong>\u6570\u636e\u4e0d\u5e73\u8861<\/strong>\u5bfc\u81f4\u7684\uff0c\u6a21\u578b\u7684\u9ed8\u8ba4\u9608\u503c\u4e3a\u8f93\u51fa\u503c\u8303\u56f4\u7684\u4e2d\u70b9\u3002\u6bd4\u5982\u903b\u8f91\u56de\u5f52\u7684\u8f93\u51fa\u8303\u56f4\u4e3a[0,1]\uff0c\u5f53\u67d0\u4e2a\u6837\u672c\u7684\u8f93\u51fa\u5927\u4e8e0.5\u5c31\u4f1a\u88ab\u5212\u5206\u4e3a\u6b63\u4f8b\uff0c\u53cd\u4e4b\u4e3a\u53cd\u4f8b\u3002\u5728\u6570\u636e\u7684\u7c7b\u522b\u4e0d\u5e73\u8861\u65f6\uff0c\u91c7\u7528\u9ed8\u8ba4\u7684\u5206\u7c7b\u9608\u503c\u53ef\u80fd\u4f1a\u5bfc\u81f4\u8f93\u51fa\u5168\u90e8\u4e3a\u6b63\u4f8b\uff0c\u4ea7\u751f\u865a\u5047\u7684\u9ad8\u51c6\u786e\u5ea6\uff0c\u5bfc\u81f4\u5206\u7c7b\u5931\u8d25\u3002<\/p>\n\n\n\n<p>\u5904\u7406\u6837\u672c\u4e0d\u5747\u8861\u95ee\u9898\u7684\u65b9\u6cd5\uff0c\u9996\u5148\u53ef\u4ee5\u9009\u62e9\u8c03\u6574\u9608\u503c\uff0c\u4f7f\u5f97\u6a21\u578b\u5bf9\u4e8e\u8f83\u5c11\u7684\u7c7b\u522b\u66f4\u4e3a\u654f\u611f\uff0c\u6216\u8005\u9009\u62e9\u5408\u9002\u7684\u8bc4\u4f30\u6807\u51c6\uff0c\u6bd4\u5982ROC\u6216\u8005F1\uff0c\u800c\u4e0d\u662f\u51c6\u786e\u5ea6\uff08accuracy\uff09\u3002\u53e6\u5916\u4e00\u79cd\u65b9\u6cd5\u5c31\u662f\u901a\u8fc7\u91c7\u6837\uff08sampling\uff09\u6765\u8c03\u6574\u6570\u636e\u7684\u4e0d\u5e73\u8861\u3002\u5176\u4e2d\u6b20\u91c7\u6837\u629b\u5f03\u4e86\u5927\u90e8\u5206\u6b63\u4f8b\u6570\u636e\uff0c\u4ece\u800c\u5f31\u5316\u4e86\u5176\u5f71\u54cd\uff0c\u53ef\u80fd\u4f1a\u9020\u6210\u504f\u5dee\u5f88\u5927\u7684\u6a21\u578b\uff0c\u540c\u65f6\uff0c\u6570\u636e\u603b\u662f\u5b9d\u8d35\u7684\uff0c\u629b\u5f03\u6570\u636e\u662f\u5f88\u5962\u4f88\u7684\u3002\u53e6\u5916\u4e00\u79cd\u662f\u8fc7\u91c7\u6837\uff0c\u4e0b\u9762\u6211\u4eec\u5c31\u4f7f\u7528\u8fc7\u91c7\u6837
\u65b9\u6cd5\u6765\u8c03\u6574\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\"><span class=\"ez-toc-section\" id=\"%E6%A0%B7%E6%9C%AC%E6%95%B0%E6%8D%AE%E4%B8%8D%E5%B9%B3%E8%A1%A1\"><\/span><a href=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining#%E6%A0%B7%E6%9C%AC%E6%95%B0%E6%8D%AE%E4%B8%8D%E5%B9%B3%E8%A1%A1\"><\/a>\u6837\u672c\u6570\u636e\u4e0d\u5e73\u8861<span class=\"ez-toc-section-end\"><\/span><\/h3>\n\n\n\n<p>\u6700\u7b80\u5355\u7684\u8fc7\u91c7\u6837\u65b9\u6cd5\uff0c\u5c31\u662f\u7b80\u5355\u590d\u5236\u6cd5\u3002\u4f46\u5355\u7eaf\u7684\u91cd\u590d\u4e86\u53cd\u4f8b\uff0c\u4f1a\u8fc7\u5206\u5f3a\u8c03\u5df2\u6709\u7684\u53cd\u4f8b\u3002\u5982\u679c\u5176\u4e2d\u90e8\u5206\u70b9\u6807\u8bb0\u9519\u8bef\u6216\u8005\u662f\u566a\u97f3\uff0c\u90a3\u4e48\u9519\u8bef\u4e5f\u5bb9\u6613\u88ab\u6210\u500d\u7684\u653e\u5927\u3002\u56e0\u6b64\u6700\u5927\u7684\u98ce\u9669\u5c31\u662f\u5bf9\u53cd\u4f8b\u8fc7\u62df\u5408\u3002<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\">#\u628a0\u7c7b\u6837\u672c\u590d\u523610\u6b21\uff0c\u6784\u9020\u8bad\u7ec3\u96c6\nindex_tmp = y_train==0\ny_tmp = y_train[index_tmp]\nx_tmp = x_train[index_tmp]\nx_train2 = pd.concat([x_train,x_tmp,x_tmp,x_tmp,x_tmp,x_tmp,x_tmp,x_tmp,x_tmp,x_tmp,x_tmp])\ny_train2 = pd.concat([y_train,y_tmp,y_tmp,y_tmp,y_tmp,y_tmp,y_tmp,y_tmp,y_tmp,y_tmp,y_tmp])\n\n#\u4f7f\u7528\u8fc7\u91c7\u6837\u6837\u672c(\u7b80\u5355\u590d\u5236)\u8fdb\u884c\u6a21\u578b\u8bad\u7ec3\uff0c\u5e76\u67e5\u770b\u51c6\u786e\u7387\nclf2 = MultinomialNB()\nclf2.fit(tv.transform(x_train2), y_train2)\ny_pred2 = clf2.predict_proba(tv.transform(x_test))[:,1]\nroc_auc_score(y_test,y_pred2)\n\n&gt;&gt;&gt;0.9049699937533463<\/pre>\n\n\n\n<p>\u67e5\u770b\u6b64\u65f6\u7684\u6df7\u6dc6\u77e9\u9635<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>&#91; 331,  100]\n&#91; 637, 
4355]\n<\/code><\/pre>\n\n\n\n<p>\u53ef\u4ee5\u770b\u51fa\uff0c\u5373\u4f7f\u662f\u7b80\u5355\u7c97\u66b4\u7684\u590d\u5236\u6837\u672c\u6765\u5904\u7406\u6837\u672c\u4e0d\u5e73\u8861\u95ee\u9898\uff0c\u8d1f\u6837\u672c\u7684\u8bc6\u522b\u7387\u5927\u5e45\u4e0a\u5347\u4e86\uff0c\u53d8\u4e3a77%\uff0c\u6ee1\u6ee1\u7684\u5e78\u798f\u611f\u5440\u3002\u8fd8\u6709SMOTE\u8fc7\u91c7\u6837\u7b97\u6cd5\uff0cSMOTE\u662f\u5728\u5c40\u90e8\u533a\u57df\u901a\u8fc7K-\u8fd1\u90bb\u751f\u6210\u4e86\u65b0\u7684\u53cd\u4f8b\u3002\u76f8\u8f83\u4e8e\u7b80\u5355\u7684\u8fc7\u91c7\u6837\uff0cSMOTE\u964d\u4f4e\u4e86\u8fc7\u62df\u5408\u98ce\u9669\uff0c\u4f46\u540c\u65f6\u8fd0\u7b97\u5f00\u9500\u52a0\u5927\uff0c\u8be6\u7ec6\u8bf7\u770b\u5177\u4f53\u4ee3\u7801~<\/p>\n\n\n\n<h3 class=\"wp-block-heading\"><span class=\"ez-toc-section\" id=\"%E6%A8%A1%E5%9E%8B%E8%AF%84%E4%BC%B0%E6%B5%8B%E8%AF%95\"><\/span><a href=\"https:\/\/github.com\/wangkaikai07\/dianping_textmining#%E6%A8%A1%E5%9E%8B%E8%AF%84%E4%BC%B0%E6%B5%8B%E8%AF%95\"><\/a>\u6a21\u578b\u8bc4\u4f30\u6d4b\u8bd5<span class=\"ez-toc-section-end\"><\/span><\/h3>\n\n\n\n<p>\u6211\u4eec\u628a3W\u6761\u6570\u636e\u90fd\u62ff\u6765\u8bad\u7ec3\uff0c\u6570\u636e\u91cf\u53d8\u591a\u4e86\uff0c\u6a21\u578b\u6548\u679c\u5e94\u8be5\u4f1a\u66f4\u597d<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\">def fenxi(strings):\n    strings_fenci = fenci(pd.Series([strings]))\n    return 
float(clf.predict_proba(tv2.transform(strings_fenci))[:,1])\n\n#\u5230\u7f51\u4e0a\u627e\u4e00\u6761\u5dee\u8bc4\u6765\u6d4b\u8bd5\u4e00\u4e0b\nfenxi('\u7cef\u7c73\u5916\u76ae\u4e0d\u7ef5\u6ed1\uff0c\u8c46\u6c99\u9985\u7c97\u8e81\uff0c\u6ca1\u6709\u9999\u751c\u5473\u300212\u5143\u4e00\u7897\u4e0d\u503c\u3002')\n\n&gt;&gt;&gt;0.28900092243477077<\/pre>\n\n\n\n<p>\u53ea\u7528\u5230\u4e86\u7b80\u5355\u7684\u673a\u5668\u5b66\u4e60\uff0c\u5c31\u505a\u51fa\u4e86\u4e0d\u9519\u7684\u60c5\u611f\u5206\u6790\u6548\u679c\uff0c\u77e5\u8bc6\u7684\u529b\u91cf\u771f\u662f\u5f3a\u5927\u5440\uff0c666~<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>import pandas as pd\nfrom matplotlib import pyplot as plt\nimport jieba\ndata = pd.read_csv('data.csv')\ndata.head()\n#\u6784\u5efalabel\u503c\ndef zhuanhuan(score):\n    if score &gt; 3:\n        return 1\n    elif score &lt; 3:\n        return 0\n    else:\n        return None\n    \n#\u7279\u5f81\u503c\u8f6c\u6362\ndata&#91;'target'] = data&#91;'stars'].map(lambda x:zhuanhuan(x))\ndata_model = data.dropna()\n\n#\u5207\u5206\u6d4b\u8bd5\u96c6\u3001\u8bad\u7ec3\u96c6\nfrom sklearn.model_selection import train_test_split\nx_train, x_test, y_train, y_test = train_test_split(data_model&#91;'cus_comment'], data_model&#91;'target'], random_state=3, test_size=0.25)\n\n#\u5f15\u5165\u505c\u7528\u8bcd\ninfile = open(\"stopwords.txt\",encoding='utf-8')\nstopwords_lst = infile.readlines()\nstopwords = &#91;x.strip() for x in stopwords_lst]\n\n#\u4e2d\u6587\u5206\u8bcd\ndef fenci(train_data):\n    words_df = train_data.apply(lambda x:' '.join(jieba.cut(x)))\n    return words_df\n \nx_train&#91;:5]\n#\u4f7f\u7528TF-IDF\u8fdb\u884c\u6587\u672c\u8f6c\u5411\u91cf\u5904\u7406\nfrom sklearn.feature_extraction.text import TfidfVectorizer\ntv = TfidfVectorizer(stop_words=stopwords, max_features=3000, ngram_range=(1,2))\ntv.fit(x_train)\n#\u8ba1\u7b97\u5206\u7c7b\u6548\u679c\u7684\u51c6\u786e\u7387\nfrom sklearn.naive_bayes import MultinomialNB\nfrom 
sklearn.metrics import roc_auc_score, f1_score\nclassifier = MultinomialNB()\nclassifier.fit(tv.transform(x_train), y_train)\nclassifier.score(tv.transform(x_test), y_test)\n#\u8ba1\u7b97\u5206\u7c7b\u5668\u7684AUC\u503c\ny_pred = classifier.predict_proba(tv.transform(x_test))&#91;:,1]\nroc_auc_score(y_test,y_pred)\n\n#\u8ba1\u7b97\u4e00\u6761\u8bc4\u8bba\u6587\u672c\u7684\u60c5\u611f\u8bc4\u5206\ndef ceshi(model,strings):\n    strings_fenci = fenci(pd.Series(&#91;strings]))\n    return float(model.predict_proba(tv.transform(strings_fenci))&#91;:,1])\n\n#\u4ece\u5927\u4f17\u70b9\u8bc4\u7f51\u627e\u4e24\u6761\u8bc4\u8bba\u6765\u6d4b\u8bd5\u4e00\u4e0b\ntest1 = '\u5f88\u597d\u5403\uff0c\u73af\u5883\u597d\uff0c\u6240\u6709\u5458\u5de5\u7684\u6001\u5ea6\u90fd\u5f88\u597d\uff0c\u4e0a\u83dc\u5feb\uff0c\u670d\u52a1\u4e5f\u5f88\u597d\uff0c\u5473\u9053\u597d\u5403\uff0c\u90fd\u662f\u7528\u84b8\u998f\u6c34\u716e\u7684\uff0c\u63a8\u8350\uff0c\u8d85\u597d\u5403' #5\u661f\u597d\u8bc4\ntest2 = '\u7cef\u7c73\u5916\u76ae\u4e0d\u7ef5\u6ed1\uff0c\u8c46\u6c99\u9985\u7c97\u8e81\uff0c\u6ca1\u6709\u9999\u751c\u5473\u300212\u5143\u4e00\u7897\u4e0d\u503c\u3002' #1\u661f\u5dee\u8bc4\ntest3 = '\u7cef\u7c73\u5916\u76ae\u4e0d\u7ef5\u6ed1\uff0c\u5bf9\u5e94\u8fd9\u6837\u7684\u4ef7\u683c\uff0c\u6709\u4e9b\u5403\u4e8f\u3002' #1\u661f\u5dee\u8bc4\nprint('\u597d\u8bc4\u5b9e\u4f8b\u7684\u6a21\u578b\u9884\u6d4b\u60c5\u611f\u5f97\u5206\u4e3a{}\\n\u5dee\u8bc4\u5b9e\u4f8b\u7684\u6a21\u578b\u9884\u6d4b\u60c5\u611f\u5f97\u5206\u4e3a{}'.format(ceshi(classifier,test1),ceshi(classifier,test3)))\n\nfrom sklearn.metrics import confusion_matrix\ny_predict = classifier.predict(tv.transform(x_test))\ncm = confusion_matrix(y_test, y_predict)\ncm\ndata&#91;'target'].value_counts()\n\n#\u628a0\u7c7b\u6837\u672c\u590d\u523610\u6b21\uff0c\u6784\u9020\u8bad\u7ec3\u96c6\nindex_tmp = y_train==0\ny_tmp = y_train&#91;index_tmp]\nx_tmp = x_train&#91;index_tmp]\nx_train2 = 
pd.concat(&#91;x_train,x_tmp,x_tmp,x_tmp,x_tmp,x_tmp,x_tmp,x_tmp,x_tmp,x_tmp,x_tmp])\ny_train2 = pd.concat(&#91;y_train,y_tmp,y_tmp,y_tmp,y_tmp,y_tmp,y_tmp,y_tmp,y_tmp,y_tmp,y_tmp])\n\n#\u4f7f\u7528\u8fc7\u91c7\u6837\u6837\u672c(\u7b80\u5355\u590d\u5236)\u8fdb\u884c\u6a21\u578b\u8bad\u7ec3\uff0c\u5e76\u67e5\u770b\u51c6\u786e\u7387\nclf2 = MultinomialNB()\nclf2.fit(tv.transform(x_train2), y_train2)\ny_pred2 = clf2.predict_proba(tv.transform(x_test))&#91;:,1]\nroc_auc_score(y_test,y_pred2)\n\n#\u67e5\u770b\u6b64\u65f6\u7684\u6df7\u6dc6\u77e9\u9635\ny_predict2 = clf2.predict(tv.transform(x_test))\ncm = confusion_matrix(y_test, y_predict2)\ncm\n\n#ceshi(clf2,'\u6392\u961f\u4eba\u592a\u591a\uff0c\u73af\u5883\u4e0d\u597d\uff0c\u53e3\u5473\u4e00\u822c')\nceshi(clf2,'\u7cef\u7c73\u5916\u76ae\u4e0d\u7ef5\u6ed1\uff0c\u5bf9\u5e94\u8fd9\u6837\u7684\u4ef7\u683c\uff0c\u4e0d\u9519')\n\n#\u4f7f\u7528SMOTE\u8fdb\u884c\u6837\u672c\u8fc7\u91c7\u6837\u5904\u7406 fit_sample  fit_resample\nfrom imblearn.over_sampling import SMOTE\noversampler=SMOTE(random_state=0)\nx_train_vec = tv.transform(x_train)\nx_resampled, y_resampled = oversampler.fit_resample(x_train_vec, y_train)\n\n\n#\u539f\u59cb\u7684\u6837\u672c\u5206\u5e03\ny_train.value_counts()\n\n\n\n#\u7ecf\u8fc7SMOTE\u7b97\u6cd5\u8fc7\u91c7\u6837\u540e\u7684\u6837\u672c\u5206\u5e03\u60c5\u51b5\npd.Series(y_resampled).value_counts()\n\n\n#\u4f7f\u7528\u8fc7\u91c7\u6837\u6837\u672c(SMOTE)\u8fdb\u884c\u6a21\u578b\u8bad\u7ec3\uff0c\u5e76\u67e5\u770b\u51c6\u786e\u7387\nclf3 = MultinomialNB()\nclf3.fit(x_resampled, y_resampled)\ny_pred3 = clf3.predict_proba(tv.transform(x_test))&#91;:,1]\nroc_auc_score(y_test,y_pred3)\n\n#\u67e5\u770b\u6b64\u65f6\u7684\u51c6\u786e\u7387\ny_predict3 = clf3.predict(tv.transform(x_test))\ncm = confusion_matrix(y_test, y_predict3)\ncm\n\n#\u5230\u7f51\u4e0a\u627e\u4e00\u6761\u5dee\u8bc4\u6765\u6d4b\u8bd5\u4e00\u4e0b\u60c5\u611f\u8bc4\u5206\u7684\u9884\u6d4b\u6548\u679c\ntest3 = 
'\u7cef\u7c73\u5916\u76ae\u4e0d\u7ef5\u6ed1\uff0c\u8c46\u6c99\u9985\u7c97\u8e81\uff0c\u6ca1\u6709\u9999\u751c\u5473\u300212\u5143\u4e00\u7897\u4e0d\u503c\u3002'\ntest3 = '\u7cef\u7c73\u5916\u76ae\u4e0d\u7ef5\u6ed1\uff0c\u5bf9\u5e94\u8fd9\u6837\u7684\u4ef7\u683c\uff0c\u4e0d\u600e\u4e48\u5408\u9002\uff0c\u597d\u96be\u5403'\n\nceshi(clf3,test3)\n\n\n#\u8bcd\u5411\u91cf\u8bad\u7ec3\ntv2 = TfidfVectorizer(stop_words=stopwords, max_features=3000, ngram_range=(1,2))\ntv2.fit(data_model&#91;'cus_comment'])\n\n#SMOTE\u63d2\u503c\nX_tmp = tv2.transform(data_model&#91;'cus_comment'])\ny_tmp = data_model&#91;'target']\nsm = SMOTE(random_state=0)\nX,y = sm.fit_resample(X_tmp, y_tmp)\n\nclf = MultinomialNB()\nclf.fit(X, y)\n\ndef fenxi(strings):\n    strings_fenci = fenci(pd.Series(&#91;strings]))\n    return float(clf.predict_proba(tv2.transform(strings_fenci))&#91;:,1])\n\n#\u5230\u7f51\u4e0a\u627e\u4e00\u6761\u5dee\u8bc4\u6765\u6d4b\u8bd5\u4e00\u4e0b\nfenxi('\u7cef\u7c73\u5916\u76ae\u4e0d\u7ef5\u6ed1\uff0c\u8c46\u6c99\u9985\u7c97\u8e81\uff0c\u6ca1\u6709\u9999\u751c\u5473\u300212\u5143\u4e00\u7897\u4e0d\u503c\u3002')<\/code><\/pre>\n\n\n\n<p><\/p>\n\n\n\n<p>\u5728\u6240\u6709\u7684\u673a\u5668\u5b66\u4e60\u5206\u7c7b\u7b97\u6cd5\u4e2d\uff0c\u6734\u7d20\u8d1d\u53f6\u65af\u548c\u5176\u4ed6\u7edd\u5927\u591a\u6570\u7684\u5206\u7c7b\u7b97\u6cd5\u90fd\u4e0d\u540c\u3002\u5bf9\u4e8e\u5927\u591a\u6570\u7684\u5206\u7c7b\u7b97\u6cd5\uff0c\u6bd4\u5982\u51b3\u7b56\u6811,KNN,\u903b\u8f91\u56de\u5f52\uff0c\u652f\u6301\u5411\u91cf\u673a\u7b49\uff0c\u4ed6\u4eec\u90fd\u662f\u5224\u522b\u65b9\u6cd5\uff0c\u4e5f\u5c31\u662f\u76f4\u63a5\u5b66\u4e60\u51fa\u7279\u5f81\u8f93\u51faY\u548c\u7279\u5f81X\u4e4b\u95f4\u7684\u5173\u7cfb\uff0c\u8981\u4e48\u662f\u51b3\u7b56\u51fd\u6570Y=f(X)Y=f(X),\u8981\u4e48\u662f\u6761\u4ef6\u5206\u5e03P(Y|X)P(Y|X)\u3002\u4f46\u662f\u6734\u7d20\u8d1d\u53f6\u65af\u5374\u662f\u751f\u6210\u65b9\u6cd5\uff0c\u4e5f\u5c31\u662f\u76f4\u63a5\u627e\u51fa\u7279\u5f81\u8f93\u51faY\u548
c\u7279\u5f81X\u7684\u8054\u5408\u5206\u5e03P(X,Y)P(X,Y),\u7136\u540e\u7528P(Y|X)=P(X,Y)\/P(X)P(Y|X)=P(X,Y)\/P(X)\u5f97\u51fa\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u6734\u7d20\u8d1d\u53f6\u65af\u5f88\u76f4\u89c2\uff0c\u8ba1\u7b97\u91cf\u4e5f\u4e0d\u5927\uff0c\u5728\u5f88\u591a\u9886\u57df\u6709\u5e7f\u6cdb\u7684\u5e94\u7528\uff0c\u8fd9\u91cc\u6211\u4eec\u5c31\u5bf9\u6734\u7d20\u8d1d\u53f6\u65af\u7b97\u6cd5\u539f\u7406\u505a\u4e00\u4e2a\u5c0f\u7ed3\u3002<\/p>\n\n\n\n<h1 class=\"wp-block-heading\"><span class=\"ez-toc-section\" id=\"1_%E6%9C%B4%E7%B4%A0%E8%B4%9D%E5%8F%B6%E6%96%AF%E7%9B%B8%E5%85%B3%E7%9A%84%E7%BB%9F%E8%AE%A1%E5%AD%A6%E7%9F%A5%E8%AF%86\"><\/span>1. \u6734\u7d20\u8d1d\u53f6\u65af\u76f8\u5173\u7684\u7edf\u8ba1\u5b66\u77e5\u8bc6<span class=\"ez-toc-section-end\"><\/span><\/h1>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u5728\u4e86\u89e3\u6734\u7d20\u8d1d\u53f6\u65af\u7684\u7b97\u6cd5\u4e4b\u524d\uff0c\u6211\u4eec\u9700\u8981\u5bf9\u76f8\u5173\u5fc5\u987b\u7684\u7edf\u8ba1\u5b66\u77e5\u8bc6\u505a\u4e00\u4e2a\u56de\u987e\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u8d1d\u53f6\u65af\u5b66\u6d3e\u5f88\u53e4\u8001\uff0c\u4f46\u662f\u4ece\u8bde\u751f\u5230\u4e00\u767e\u5e74\u524d\u4e00\u76f4\u4e0d\u662f\u4e3b\u6d41\u3002\u4e3b\u6d41\u662f\u9891\u7387\u5b66\u6d3e\u3002\u9891\u7387\u5b66\u6d3e\u7684\u6743\u5a01\u76ae\u5c14\u900a\u548c\u8d39\u6b47\u5c14\u90fd\u5bf9\u8d1d\u53f6\u65af\u5b66\u6d3e\u4e0d\u5c51\u4e00\u987e\uff0c\u4f46\u662f\u8d1d\u53f6\u65af\u5b66\u6d3e\u786c\u662f\u51ed\u501f\u5728\u73b0\u4ee3\u7279\u5b9a\u9886\u57df\u7684\u51fa\u8272\u5e94\u7528\u8868\u73b0\u4e3a\u81ea\u5df1\u8d62\u5f97\u4e86\u534a\u58c1\u6c5f\u5c71\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u8d1d\u53f6\u65af\u5b66\u6d3e\u7684\u601d\u60f3\u53ef\u4ee5\u6982\u62ec\u4e3a\u5148\u9a8c\u6982\u7387+\u6570\u636e=\u540e\u9a8c\u6982\u7387\u3002\u4e5f\u5c31\u662f\u8bf4\u6211\u4eec\u5728\u5b9e\u9645\u95ee\u9898\u4e2d\u9700\u8981\u5f97\u5230\u7684\u540e\u9a8c\u6982\u7387\uff0c\u53ef\u4ee5\u90
1a\u8fc7\u5148\u9a8c\u6982\u7387\u548c\u6570\u636e\u4e00\u8d77\u7efc\u5408\u5f97\u5230\u3002\u6570\u636e\u5927\u5bb6\u597d\u7406\u89e3\uff0c\u88ab\u9891\u7387\u5b66\u6d3e\u653b\u51fb\u7684\u662f\u5148\u9a8c\u6982\u7387\uff0c\u4e00\u822c\u6765\u8bf4\u5148\u9a8c\u6982\u7387\u5c31\u662f\u6211\u4eec\u5bf9\u4e8e\u6570\u636e\u6240\u5728\u9886\u57df\u7684\u5386\u53f2\u7ecf\u9a8c\uff0c\u4f46\u662f\u8fd9\u4e2a\u7ecf\u9a8c\u5e38\u5e38\u96be\u4ee5\u91cf\u5316\u6216\u8005\u6a21\u578b\u5316\uff0c\u4e8e\u662f\u8d1d\u53f6\u65af\u5b66\u6d3e\u5927\u80c6\u7684\u5047\u8bbe\u5148\u9a8c\u5206\u5e03\u7684\u6a21\u578b\uff0c\u6bd4\u5982\u6b63\u6001\u5206\u5e03\uff0cbeta\u5206\u5e03\u7b49\u3002\u8fd9\u4e2a\u5047\u8bbe\u4e00\u822c\u6ca1\u6709\u7279\u5b9a\u7684\u4f9d\u636e\uff0c\u56e0\u6b64\u4e00\u76f4\u88ab\u9891\u7387\u5b66\u6d3e\u8ba4\u4e3a\u5f88\u8352\u8c2c\u3002\u867d\u7136\u96be\u4ee5\u4ece\u4e25\u5bc6\u7684\u6570\u5b66\u903b\u8f91\u91cc\u63a8\u51fa\u8d1d\u53f6\u65af\u5b66\u6d3e\u7684\u903b\u8f91\uff0c\u4f46\u662f\u5728\u5f88\u591a\u5b9e\u9645\u5e94\u7528\u4e2d\uff0c\u8d1d\u53f6\u65af\u7406\u8bba\u5f88\u597d\u7528\uff0c\u6bd4\u5982\u5783\u573e\u90ae\u4ef6\u5206\u7c7b\uff0c\u6587\u672c\u5206\u7c7b\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u6211\u4eec\u5148\u770b\u770b\u6761\u4ef6\u72ec\u7acb\u516c\u5f0f\uff0c\u5982\u679cX\u548cY\u76f8\u4e92\u72ec\u7acb\uff0c\u5219\u6709\uff1a<\/p>\n\n\n\n<p>$$P(X,Y)=P(X)P(Y)$$<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u6211\u4eec\u63a5\u7740\u770b\u770b\u6761\u4ef6\u6982\u7387\u516c\u5f0f\uff1a<\/p>\n\n\n\n<p>$$P(Y|X)=\\frac{P(X,Y)}{P(X)}$$<\/p>\n\n\n\n<p>$$P(X|Y)=\\frac{P(X,Y)}{P(Y)}$$<\/p>\n\n\n\n<p>\u6216\u8005\u8bf4:<\/p>\n\n\n\n<p>$$P(Y|X)=\\frac{P(X|Y)P(Y)}{P(X)}$$<\/p>\n\n\n\n<p>\u63a5\u7740\u770b\u770b\u5168\u6982\u7387\u516c\u5f0f<\/p>\n\n\n\n<p>$$P(X)=\\sum_{k}P(X|Y=Y_k)P(Y_k)$$\u5176\u4e2d$\\sum_{k}P(Y_k)=1$<\/p>\n\n\n\n<p>\u4ece\u4e0a\u9762\u7684\u516c\u5f0f\u5f88\u5
bb9\u6613\u5f97\u51fa\u8d1d\u53f6\u65af\u516c\u5f0f\uff1a<\/p>\n\n\n\n<p>$$P(Y_k|X)=\\frac{P(X|Y_k)P(Y_k)}{\\sum_{k}P(X|Y=Y_k)P(Y_k)}$$<\/p>\n\n\n\n<h1 class=\"wp-block-heading\"><span class=\"ez-toc-section\" id=\"_2_%E6%9C%B4%E7%B4%A0%E8%B4%9D%E5%8F%B6%E6%96%AF%E7%9A%84%E6%A8%A1%E5%9E%8B\"><\/span>&nbsp;2. \u6734\u7d20\u8d1d\u53f6\u65af\u7684\u6a21\u578b<span class=\"ez-toc-section-end\"><\/span><\/h1>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u4ece\u7edf\u8ba1\u5b66\u77e5\u8bc6\u56de\u5230\u6211\u4eec\u7684\u6570\u636e\u5206\u6790\u3002\u5047\u5982\u6211\u4eec\u7684\u5206\u7c7b\u6a21\u578b\u6837\u672c\u662f\uff1a<\/p>\n\n\n\n<p>$$(x_1^{(1)},x_2^{(1)},\\ldots,x_n^{(1)},y_1),(x_1^{(2)},x_2^{(2)},\\ldots,x_n^{(2)},y_2),\\ldots,(x_1^{(m)},x_2^{(m)},\\ldots,x_n^{(m)},y_m)$$<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u5373\u6211\u4eec\u6709m\u4e2a\u6837\u672c\uff0c\u6bcf\u4e2a\u6837\u672c\u6709n\u4e2a\u7279\u5f81\uff0c\u7279\u5f81\u8f93\u51fa\u6709K\u4e2a\u7c7b\u522b\uff0c\u5b9a\u4e49\u4e3a$C_1,C_2,\\ldots,C_K$\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u4ece\u6837\u672c\u6211\u4eec\u53ef\u4ee5\u5b66\u4e60\u5f97\u5230\u6734\u7d20\u8d1d\u53f6\u65af\u7684\u5148\u9a8c\u5206\u5e03$P(Y=C_k)\\ (k=1,2,\\ldots,K)$,\u63a5\u7740\u5b66\u4e60\u5230\u6761\u4ef6\u6982\u7387\u5206\u5e03$P(X=x|Y=C_k)=P(X_1=x_1,X_2=x_2,\\ldots,X_n=x_n|Y=C_k)$,\u7136\u540e\u6211\u4eec\u5c31\u53ef\u4ee5\u7528\u8d1d\u53f6\u65af\u516c\u5f0f\u5f97\u5230X\u548cY\u7684\u8054\u5408\u5206\u5e03P(X,Y)\u4e86\u3002\u8054\u5408\u5206\u5e03P(X,Y)\u5b9a\u4e49\u4e3a\uff1a<\/p>\n\n\n\n<p>\\begin{align}P(X,Y=C_k)&amp;=P(Y=C_k)P(X=x|Y=C_k)\\tag{1}\\\\&amp;=P(Y=C_k)P(X_1=x_1,X_2=x_2,\\ldots,X_n=x_n|Y=C_k)\\tag{2}\\end{align}<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u4ece\u4e0a\u9762\u7684\u5f0f\u5b50\u53ef\u4ee5\u770b\u51fa$P(Y=C_k)$\u6bd4\u8f83\u5bb9\u6613\u901a\u8fc7\u6700\u5927\u4f3c\
u7136\u6cd5\u6c42\u51fa\uff0c\u5f97\u5230\u7684P(Y=Ck)P(Y=Ck)\u5c31\u662f\u7c7b\u522bCkCk\u5728\u8bad\u7ec3\u96c6\u91cc\u9762\u51fa\u73b0\u7684\u9891\u6570\u3002\u4f46\u662fP(X1=x1,X2=x2,&#8230;Xn=xn|Y=Ck)P(X1=x1,X2=x2,&#8230;Xn=xn|Y=Ck)\u5f88\u96be\u6c42\u51fa,\u8fd9\u662f\u4e00\u4e2a\u8d85\u7ea7\u590d\u6742\u7684\u6709n\u4e2a\u7ef4\u5ea6\u7684\u6761\u4ef6\u5206\u5e03\u3002\u6734\u7d20\u8d1d\u53f6\u65af\u6a21\u578b\u5728\u8fd9\u91cc\u505a\u4e86\u4e00\u4e2a\u5927\u80c6\u7684\u5047\u8bbe\uff0c\u5373X\u7684n\u4e2a\u7ef4\u5ea6\u4e4b\u95f4\u76f8\u4e92\u72ec\u7acb\uff0c\u8fd9\u6837\u5c31\u53ef\u4ee5\u5f97\u51fa:<\/p>\n\n\n\n<p>P(X1=x1,X2=x2,&#8230;Xn=xn|Y=Ck)=P(X1=x1|Y=Ck)P(X2=x2|Y=Ck)&#8230;P(Xn=xn|Y=Ck)P(X1=x1,X2=x2,&#8230;Xn=xn|Y=Ck)=P(X1=x1|Y=Ck)P(X2=x2|Y=Ck)&#8230;P(Xn=xn|Y=Ck)<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u4ece\u4e0a\u5f0f\u53ef\u4ee5\u770b\u51fa\uff0c\u8fd9\u4e2a\u5f88\u96be\u7684\u6761\u4ef6\u5206\u5e03\u5927\u5927\u7684\u7b80\u5316\u4e86\uff0c\u4f46\u662f\u8fd9\u4e5f\u53ef\u80fd\u5e26\u6765\u9884\u6d4b\u7684\u4e0d\u51c6\u786e\u6027\u3002\u4f60\u4f1a\u8bf4\u5982\u679c\u6211\u7684\u7279\u5f81\u4e4b\u95f4\u975e\u5e38\u4e0d\u72ec\u7acb\u600e\u4e48\u529e\uff1f\u5982\u679c\u771f\u662f\u975e\u5e38\u4e0d\u72ec\u7acb\u7684\u8bdd\uff0c\u90a3\u5c31\u5c3d\u91cf\u4e0d\u8981\u4f7f\u7528\u6734\u7d20\u8d1d\u53f6\u65af\u6a21\u578b\u4e86\uff0c\u8003\u8651\u4f7f\u7528\u5176\u4ed6\u7684\u5206\u7c7b\u65b9\u6cd5\u6bd4\u8f83\u597d\u3002\u4f46\u662f\u4e00\u822c\u60c5\u51b5\u4e0b\uff0c\u6837\u672c\u7684\u7279\u5f81\u4e4b\u95f4\u72ec\u7acb\u8fd9\u4e2a\u6761\u4ef6\u7684\u786e\u662f\u5f31\u6210\u7acb\u7684\uff0c\u5c24\u5176\u662f\u6570\u636e\u91cf\u975e\u5e38\u5927\u7684\u65f6\u5019\u3002\u867d\u7136\u6211\u4eec\u727a\u7272\u4e86\u51c6\u786e\u6027\uff0c\u4f46\u662f\u5f97\u5230\u7684\u597d\u5904\u662f\u6a21\u578b\u7684\u6761\u4ef6\u5206\u5e03\u7684\u8ba1\u7b97\u5927\u5927\u7b80\u5316\u4e86\uff0c\u8fd9\u5c31\u662f\u8d1d\u53f6\u65af\u6a21\u578b\u7684\u9009\u62e9\u3002<\/p>\n\n\n\n<
p>\u3000\u3000\u3000\u3000\u6700\u540e\u56de\u5230\u6211\u4eec\u8981\u89e3\u51b3\u7684\u95ee\u9898\uff0c\u6211\u4eec\u7684\u95ee\u9898\u662f\u7ed9\u5b9a\u6d4b\u8bd5\u96c6\u7684\u4e00\u4e2a\u65b0\u6837\u672c\u7279\u5f81(x(test)1,x(test)2,&#8230;x(test)n(x1(test),x2(test),&#8230;xn(test)\uff0c\u6211\u4eec\u5982\u4f55\u5224\u65ad\u5b83\u5c5e\u4e8e\u54ea\u4e2a\u7c7b\u578b\uff1f<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u65e2\u7136\u662f\u8d1d\u53f6\u65af\u6a21\u578b\uff0c\u5f53\u7136\u662f\u540e\u9a8c\u6982\u7387\u6700\u5927\u5316\u6765\u5224\u65ad\u5206\u7c7b\u4e86\u3002\u6211\u4eec\u53ea\u8981\u8ba1\u7b97\u51fa\u6240\u6709\u7684K\u4e2a\u6761\u4ef6\u6982\u7387P(Y=Ck|X=X(test))P(Y=Ck|X=X(test)),\u7136\u540e\u627e\u51fa\u6700\u5927\u7684\u6761\u4ef6\u6982\u7387\u5bf9\u5e94\u7684\u7c7b\u522b\uff0c\u8fd9\u5c31\u662f\u6734\u7d20\u8d1d\u53f6\u65af\u7684\u9884\u6d4b\u4e86\u3002<\/p>\n\n\n\n<h1 class=\"wp-block-heading\"><span class=\"ez-toc-section\" id=\"3_%E6%9C%B4%E7%B4%A0%E8%B4%9D%E5%8F%B6%E6%96%AF%E7%9A%84%E6%8E%A8%E6%96%AD%E8%BF%87%E7%A8%8B\"><\/span>3.&nbsp;\u6734\u7d20\u8d1d\u53f6\u65af\u7684\u63a8\u65ad\u8fc7\u7a0b<span 
class=\"ez-toc-section-end\"><\/span><\/h1>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u4e0a\u8282\u6211\u4eec\u5df2\u7ecf\u5bf9\u6734\u7d20\u8d1d\u53f6\u65af\u7684\u6a21\u578b\u548c\u9884\u6d4b\u65b9\u6cd5\u505a\u4e86\u4e00\u4e2a\u5927\u6982\u7684\u89e3\u91ca\uff0c\u8fd9\u91cc\u6211\u4eec\u5bf9\u6734\u7d20\u8d1d\u53f6\u65af\u7684\u63a8\u65ad\u8fc7\u7a0b\u505a\u4e00\u4e2a\u5b8c\u6574\u7684\u8be0\u91ca\u8fc7\u7a0b\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u6211\u4eec\u9884\u6d4b\u7684\u7c7b\u522b$C_{result}$\u662f\u4f7f$P(Y=C_k|X=X^{(test)})$\u6700\u5927\u5316\u7684\u7c7b\u522b\uff0c\u6570\u5b66\u8868\u8fbe\u5f0f\u4e3a\uff1a<\/p>\n\n\n\n<p>\\begin{align}C_{result}&amp;=\\underbrace{argmax}_{C_k}P(Y=C_k|X=X^{(test)})\\tag{3}\\\\&amp;=\\underbrace{argmax}_{C_k}\\frac{P(X=X^{(test)}|Y=C_k)P(Y=C_k)}{P(X=X^{(test)})}\\tag{4}\\end{align}<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u7531\u4e8e\u5bf9\u4e8e\u6240\u6709\u7684\u7c7b\u522b\u8ba1\u7b97$P(Y=C_k|X=X^{(test)})$\u65f6\uff0c\u4e0a\u5f0f\u7684\u5206\u6bcd\u662f\u4e00\u6837\u7684\uff0c\u90fd\u662f$P(X=X^{(test)})$\uff0c\u56e0\u6b64\uff0c\u6211\u4eec\u7684\u9884\u6d4b\u516c\u5f0f\u53ef\u4ee5\u7b80\u5316\u4e3a\uff1a<\/p>\n\n\n\n<p>$$C_{result}=\\underbrace{argmax}_{C_k}P(X=X^{(test)}|Y=C_k)P(Y=C_k)$$\u3000\u3000\u3000<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u63a5\u7740\u6211\u4eec\u5229\u7528\u6734\u7d20\u8d1d\u53f6\u65af\u7684\u72ec\u7acb\u6027\u5047\u8bbe\uff0c\u5c31\u53ef\u4ee5\u5f97\u5230\u901a\u5e38\u610f\u4e49\u4e0a\u7684\u6734\u7d20\u8d1d\u53f6\u65af\u63a8\u65ad\u516c\u5f0f:<\/p>\n\n\n\n<p>$$C_{result}=\\underbrace{argmax}_{C_k}P(Y=C_k)\\prod_{j=1}^{n}P(X_j=X_j^{(test)}|Y=C_k)$$<\/p>\n\n\n\n<h1 class=\"wp-block-heading\"><span 
class=\"ez-toc-section\" id=\"4_%E6%9C%B4%E7%B4%A0%E8%B4%9D%E5%8F%B6%E6%96%AF%E7%9A%84%E5%8F%82%E6%95%B0%E4%BC%B0%E8%AE%A1\"><\/span>4.&nbsp;\u6734\u7d20\u8d1d\u53f6\u65af\u7684\u53c2\u6570\u4f30\u8ba1<span class=\"ez-toc-section-end\"><\/span><\/h1>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u5728\u4e0a\u4e00\u8282\u4e2d\uff0c\u6211\u4eec\u77e5\u9053\u53ea\u8981\u6c42\u51fa$P(Y=C_k)$\u548c$P(X_j=X_j^{(test)}|Y=C_k)\\ (j=1,2,\\ldots,n)$\uff0c\u6211\u4eec\u901a\u8fc7\u6bd4\u8f83\u5c31\u53ef\u4ee5\u5f97\u5230\u6734\u7d20\u8d1d\u53f6\u65af\u7684\u63a8\u65ad\u7ed3\u679c\u3002\u8fd9\u4e00\u8282\u6211\u4eec\u5c31\u8ba8\u8bba\u600e\u4e48\u901a\u8fc7\u8bad\u7ec3\u96c6\u8ba1\u7b97\u8fd9\u4e24\u4e2a\u6982\u7387\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u5bf9\u4e8e$P(Y=C_k)$,\u6bd4\u8f83\u7b80\u5355\uff0c\u901a\u8fc7\u6781\u5927\u4f3c\u7136\u4f30\u8ba1\u6211\u4eec\u5f88\u5bb9\u6613\u5f97\u5230$P(Y=C_k)$\u4e3a\u6837\u672c\u7c7b\u522b$C_k$\u51fa\u73b0\u7684\u9891\u7387\uff0c\u5373\u6837\u672c\u7c7b\u522b$C_k$\u51fa\u73b0\u7684\u6b21\u6570$m_k$\u9664\u4ee5\u6837\u672c\u603b\u6570m\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u5bf9\u4e8e$P(X_j=X_j^{(test)}|Y=C_k)\\ (j=1,2,\\ldots,n)$,\u8fd9\u4e2a\u53d6\u51b3\u4e8e\u6211\u4eec\u7684\u5148\u9a8c\u6761\u4ef6\uff1a<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000a)&nbsp;\u5982\u679c\u6211\u4eec\u7684$X_j$\u662f\u79bb\u6563\u7684\u503c\uff0c\u90a3\u4e48\u6211\u4eec\u53ef\u4ee5\u5047\u8bbe$X_j$\u7b26\u5408\u591a\u9879\u5f0f\u5206\u5e03\uff0c\u8fd9\u6837\u5f97\u5230$P(X_j=X_j^{(test)}|Y=C_k)$&nbsp;\u662f\u5728\u6837\u672c\u7c7b\u522b$C_k$\u4e2d\uff0c\u7279\u5f81$X_j^{(test)}$\u51fa\u73b0\u7684\u9891\u7387\u3002\u5373\uff1a<\/p>\n\n\n\n<p>$$P(X_j=X_j^{(test)}|Y=C_k)=\\frac{m_{kj^{test}}}{m_k}$$<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u5176\u4e2d$m_k$\u4e3a\u6837\u672c\u7c7b\u522b$C_k$\u603b\u7684\u7279\u5f81\u8ba1\u6570\uff0c\u800c$m_{kj^{test}}$\u4e3a\u7c7b\u522b\u4e3a$C_k$\u7684\u
6837\u672c\u4e2d\uff0c\u7b2cj\u7ef4\u7279\u5f81$X_j^{(test)}$\u51fa\u73b0\u7684\u8ba1\u6570\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u67d0\u4e9b\u65f6\u5019\uff0c\u53ef\u80fd\u67d0\u4e9b\u7c7b\u522b\u5728\u6837\u672c\u4e2d\u6ca1\u6709\u51fa\u73b0\uff0c\u8fd9\u6837\u53ef\u80fd\u5bfc\u81f4$P(X_j=X_j^{(test)}|Y=C_k)$\u4e3a0\uff0c\u8fd9\u6837\u4f1a\u5f71\u54cd\u540e\u9a8c\u7684\u4f30\u8ba1\uff0c\u4e3a\u4e86\u89e3\u51b3\u8fd9\u79cd\u60c5\u51b5\uff0c\u6211\u4eec\u5f15\u5165\u4e86\u62c9\u666e\u62c9\u65af\u5e73\u6ed1\uff0c\u5373\u6b64\u65f6\u6709\uff1a<\/p>\n\n\n\n<p>$$P(X_j=X_j^{(test)}|Y=C_k)=\\frac{m_{kj^{test}}+\\lambda}{m_k+O_j\\lambda}$$\u3000\u3000\u3000<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u5176\u4e2d$\\lambda$&nbsp;\u4e3a\u4e00\u4e2a\u5927\u4e8e0\u7684\u5e38\u6570\uff0c\u5e38\u5e38\u53d6\u4e3a1\u3002$O_j$\u4e3a\u7b2cj\u4e2a\u7279\u5f81\u7684\u53d6\u503c\u4e2a\u6570\u3002<\/p>\n\n\n\n<p>\u3000<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000b)\u5982\u679c\u6211\u4eec\u7684$X_j$\u662f\u975e\u5e38\u7a00\u758f\u7684\u79bb\u6563\u503c\uff0c\u5373\u5404\u4e2a\u7279\u5f81\u51fa\u73b0\u6982\u7387\u5f88\u4f4e\uff0c\u8fd9\u65f6\u6211\u4eec\u53ef\u4ee5\u5047\u8bbe$X_j$\u7b26\u5408\u4f2f\u52aa\u5229\u5206\u5e03\uff0c\u5373\u7279\u5f81$X_j$\u51fa\u73b0\u8bb0\u4e3a1\uff0c\u4e0d\u51fa\u73b0\u8bb0\u4e3a0\u3002\u5373\u53ea\u8981$X_j$\u51fa\u73b0\u5373\u53ef\uff0c\u6211\u4eec\u4e0d\u5173\u6ce8$X_j$\u7684\u6b21\u6570\u3002\u8fd9\u6837\u5f97\u5230$P(X_j=X_j^{(test)}|Y=C_k)$&nbsp;\u662f\u5728\u6837\u672c\u7c7b\u522b$C_k$\u4e2d\uff0c$X_j^{(test)}$\u51fa\u73b0\u7684\u9891\u7387\u3002\u6b64\u65f6\u6709\uff1a<\/p>\n\n\n\n<p>$$P(X_j=X_j^{(test)}|Y=C_k)=P(X_j=1|Y=C_k)X_j^{(test)}+(1-P(X_j=1|Y=C_k))(1-X_j^{(test)})$$<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u5176\u4e2d\uff0c$X_j^{(test)}$\u53d6\u503c\u4e3a0\u548c1\u3002<\/p>\n\n\n\n<p>\u3000\u3000<\/p>\n\n\n\n<p>\u3000\u3000\u3000\
u3000c)\u5982\u679c\u6211\u4eec\u7684$X_j$\u662f\u8fde\u7eed\u503c\uff0c\u6211\u4eec\u901a\u5e38\u53d6$X_j$\u7684\u5148\u9a8c\u6982\u7387\u4e3a\u6b63\u6001\u5206\u5e03\uff0c\u5373\u5728\u6837\u672c\u7c7b\u522b$C_k$\u4e2d\uff0c$X_j$\u7684\u503c\u7b26\u5408\u6b63\u6001\u5206\u5e03\u3002\u8fd9\u6837$P(X_j=X_j^{(test)}|Y=C_k)$\u7684\u6982\u7387\u5206\u5e03\u662f\uff1a<\/p>\n\n\n\n<p>$$P(X_j=X_j^{(test)}|Y=C_k)=\\frac{1}{\\sqrt{2\\pi\\sigma_k^2}}\\exp\\left(-\\frac{(X_j^{(test)}-\\mu_k)^2}{2\\sigma_k^2}\\right)$$<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u5176\u4e2d$\\mu_k$\u548c$\\sigma_k^2$\u662f\u6b63\u6001\u5206\u5e03\u7684\u671f\u671b\u548c\u65b9\u5dee\uff0c\u53ef\u4ee5\u901a\u8fc7\u6781\u5927\u4f3c\u7136\u4f30\u8ba1\u6c42\u5f97\u3002$\\mu_k$\u4e3a\u5728\u6837\u672c\u7c7b\u522b$C_k$\u4e2d\uff0c\u6240\u6709$X_j$\u7684\u5e73\u5747\u503c\u3002$\\sigma_k^2$\u4e3a\u5728\u6837\u672c\u7c7b\u522b$C_k$\u4e2d\uff0c\u6240\u6709$X_j$\u7684\u65b9\u5dee\u3002\u5bf9\u4e8e\u4e00\u4e2a\u8fde\u7eed\u7684\u6837\u672c\u503c\uff0c\u4ee3\u5165\u6b63\u6001\u5206\u5e03\u7684\u516c\u5f0f\uff0c\u5c31\u53ef\u4ee5\u6c42\u51fa\u6982\u7387\u5206\u5e03\u4e86\u3002<\/p>\n\n\n\n<h1 class=\"wp-block-heading\"><span class=\"ez-toc-section\" id=\"5_%E6%9C%B4%E7%B4%A0%E8%B4%9D%E5%8F%B6%E6%96%AF%E7%AE%97%E6%B3%95%E8%BF%87%E7%A8%8B\"><\/span>5. 
&nbsp;\u6734\u7d20\u8d1d\u53f6\u65af\u7b97\u6cd5\u8fc7\u7a0b<span class=\"ez-toc-section-end\"><\/span><\/h1>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u6211\u4eec\u5047\u8bbe\u8bad\u7ec3\u96c6\u4e3am\u4e2a\u6837\u672cn\u4e2a\u7ef4\u5ea6\uff0c\u5982\u4e0b\uff1a<\/p>\n\n\n\n<p>(x(1)1,x(1)2,&#8230;x(1)n,y1),(x(2)1,x(2)2,&#8230;x(2)n,y2),&#8230;(x(m)1,x(m)2,&#8230;x(m)n,ym)(x1(1),x2(1),&#8230;xn(1),y1),(x1(2),x2(2),&#8230;xn(2),y2),&#8230;(x1(m),x2(m),&#8230;xn(m),ym)<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u5171\u6709K\u4e2a\u7279\u5f81\u8f93\u51fa\u7c7b\u522b\uff0c\u5206\u522b\u4e3aC1,C2,&#8230;,CKC1,C2,&#8230;,CK,\u6bcf\u4e2a\u7279\u5f81\u8f93\u51fa\u7c7b\u522b\u7684\u6837\u672c\u4e2a\u6570\u4e3am1,m2,&#8230;,mKm1,m2,&#8230;,mK,\u5728\u7b2ck\u4e2a\u7c7b\u522b\u4e2d\uff0c\u5982\u679c\u662f\u79bb\u6563\u7279\u5f81\uff0c\u5219\u7279\u5f81XjXj\u5404\u4e2a\u7c7b\u522b\u53d6\u503c\u4e3amkjlmkjl\u3002\u5176\u4e2dl\u53d6\u503c\u4e3a1,2,&#8230;Sj1,2,&#8230;Sj\uff0cSjSj\u4e3a\u7279\u5f81j\u4e0d\u540c\u7684\u53d6\u503c\u6570\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u8f93\u51fa\u4e3a\u5b9e\u4f8bX(test)X(test)\u7684\u5206\u7c7b\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u7b97\u6cd5\u6d41\u7a0b\u5982\u4e0b\uff1a<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u30001) \u5982\u679c\u6ca1\u6709Y\u7684\u5148\u9a8c\u6982\u7387\uff0c\u5219\u8ba1\u7b97Y\u7684K\u4e2a\u5148\u9a8c\u6982\u7387\uff1aP(Y=Ck)=(mk+\u03bb)\/(m+K\u03bb)P(Y=Ck)=(mk+\u03bb)\/(m+K\u03bb)\uff0c\u5426\u5219P(Y=Ck)P(Y=Ck)\u4e3a\u8f93\u5165\u7684\u5148\u9a8c\u6982\u7387\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u30002) 
\u5206\u522b\u8ba1\u7b97\u7b2ck\u4e2a\u7c7b\u522b\u7684\u7b2cj\u7ef4\u7279\u5f81\u7684\u7b2cl\u4e2a\u53d6\u503c\u6761\u4ef6\u6982\u7387\uff1a$P(X_j=x_{jl}|Y=C_k)$<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u3000\u3000a)\u5982\u679c\u662f\u79bb\u6563\u503c:<\/p>\n\n\n\n<p>$$P(X_j=x_{jl}|Y=C_k)=\\frac{m_{kjl}+\\lambda}{m_k+S_j\\lambda}$$<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u3000\u3000$\\lambda$\u53ef\u4ee5\u53d6\u503c\u4e3a1\uff0c\u6216\u8005\u5176\u4ed6\u5927\u4e8e0\u7684\u6570\u5b57\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u3000\u3000b)\u5982\u679c\u662f\u7a00\u758f\u4e8c\u9879\u79bb\u6563\u503c:<\/p>\n\n\n\n<p>$$P(X_j=x_{jl}|Y=C_k)=P(j|Y=C_k)x_{jl}+(1-P(j|Y=C_k))(1-x_{jl})$$<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u3000\u3000 \u6b64\u65f6$l$\u53ea\u6709\u4e24\u79cd\u53d6\u503c\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u3000\u3000c)\u5982\u679c\u662f\u8fde\u7eed\u503c\u4e0d\u9700\u8981\u8ba1\u7b97\u5404\u4e2al\u7684\u53d6\u503c\u6982\u7387\uff0c\u76f4\u63a5\u6c42\u6b63\u6001\u5206\u5e03\u7684\u53c2\u6570:<\/p>\n\n\n\n<p>$$P(X_j=x_j|Y=C_k)=\\frac{1}{\\sqrt{2\\pi\\sigma_k^2}}\\exp\\left(-\\frac{(x_j-\\mu_k)^2}{2\\sigma_k^2}\\right)$$<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u3000\u3000\u9700\u8981\u6c42\u51fa$\\mu_k$\u548c$\\sigma_k^2$\u3002&nbsp;$\\mu_k$\u4e3a\u5728\u6837\u672c\u7c7b\u522b$C_k$\u4e2d\uff0c\u6240\u6709$X_j$\u7684\u5e73\u5747\u503c\u3002$\\sigma_k^2$\u4e3a\u5728\u6837\u672c\u7c7b\u522b$C_k$\u4e2d\uff0c\u6240\u6709$X_j$\u7684\u65b9\u5dee\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u30003\uff09\u5bf9\u4e8e\u5b9e\u4f8b$X^{(test)}$\uff0c\u5206\u522b\u8ba1\u7b97\uff1a<\/p>\n\n\n\n<p>$$P(Y=C_k)\\prod_{j=1}^{n}P(X_j=x_j^{(test)}|Y=C_k)$$<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u30004\uff09\u786e\u5b9a\u5b9e\u4f8b$X^{(test)}$\u7684\u5206\u7c7b$C_{result}$<\/p>\n\n\n\n<p>$$C_{result}=\\underset{C_k}{\\arg\\max}\\;P(Y=C_k)\\prod_{j=1}^{n}P(X_j=x_j^{(test)}|Y=C_k)$$<\/p>\n\n\n\n
<p>\u3000\u3000\u3000\u3000 \u4ece\u4e0a\u9762\u7684\u8ba1\u7b97\u53ef\u4ee5\u770b\u51fa\uff0c\u6ca1\u6709\u590d\u6742\u7684\u6c42\u5bfc\u548c\u77e9\u9635\u8fd0\u7b97\uff0c\u56e0\u6b64\u6548\u7387\u5f88\u9ad8\u3002<\/p>\n\n\n\n<h1 class=\"wp-block-heading\"><span class=\"ez-toc-section\" id=\"6_%E6%9C%B4%E7%B4%A0%E8%B4%9D%E5%8F%B6%E6%96%AF%E7%AE%97%E6%B3%95%E5%B0%8F%E7%BB%93\"><\/span>6.&nbsp;&nbsp;\u6734\u7d20\u8d1d\u53f6\u65af\u7b97\u6cd5\u5c0f\u7ed3<span class=\"ez-toc-section-end\"><\/span><\/h1>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u6734\u7d20\u8d1d\u53f6\u65af\u7b97\u6cd5\u7684\u4e3b\u8981\u539f\u7406\u57fa\u672c\u5df2\u7ecf\u505a\u4e86\u603b\u7ed3\uff0c\u8fd9\u91cc\u5bf9\u6734\u7d20\u8d1d\u53f6\u65af\u7684\u4f18\u7f3a\u70b9\u505a\u4e00\u4e2a\u603b\u7ed3\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u6734\u7d20\u8d1d\u53f6\u65af\u7684\u4e3b\u8981\u4f18\u70b9\u6709\uff1a<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u30001\uff09\u6734\u7d20\u8d1d\u53f6\u65af\u6a21\u578b\u53d1\u6e90\u4e8e\u53e4\u5178\u6570\u5b66\u7406\u8bba\uff0c\u6709\u7a33\u5b9a\u7684\u5206\u7c7b\u6548\u7387\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u30002\uff09\u5bf9\u5c0f\u89c4\u6a21\u7684\u6570\u636e\u8868\u73b0\u5f88\u597d\uff0c\u80fd\u591f\u5904\u7406\u591a\u5206\u7c7b\u4efb\u52a1\uff0c\u9002\u5408\u589e\u91cf\u5f0f\u8bad\u7ec3\uff0c\u5c24\u5176\u662f\u6570\u636e\u91cf\u8d85\u51fa\u5185\u5b58\u65f6\uff0c\u6211\u4eec\u53ef\u4ee5\u4e00\u6279\u6279\u7684\u53bb\u589e\u91cf\u8bad\u7ec3\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u30003\uff09\u5bf9\u7f3a\u5931\u6570\u636e\u4e0d\u592a\u654f\u611f\uff0c\u7b97\u6cd5\u4e5f\u6bd4\u8f83\u7b80\u5355\uff0c\u5e38\u7528\u4e8e\u6587\u672c\u5206\u7c7b\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u6734\u7d20\u8d1d\u53f6\u65af\u7684\u4e3b\u8981\u7f3a\u70b9\u6709\uff1a\u3000\u3000\u3000<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u
30001\uff09 \u7406\u8bba\u4e0a\uff0c\u6734\u7d20\u8d1d\u53f6\u65af\u6a21\u578b\u4e0e\u5176\u4ed6\u5206\u7c7b\u65b9\u6cd5\u76f8\u6bd4\u5177\u6709\u6700\u5c0f\u7684\u8bef\u5dee\u7387\u3002\u4f46\u662f\u5b9e\u9645\u4e0a\u5e76\u975e\u603b\u662f\u5982\u6b64\uff0c\u8fd9\u662f\u56e0\u4e3a\u6734\u7d20\u8d1d\u53f6\u65af\u6a21\u578b\u7ed9\u5b9a\u8f93\u51fa\u7c7b\u522b\u7684\u60c5\u51b5\u4e0b,\u5047\u8bbe\u5c5e\u6027\u4e4b\u95f4\u76f8\u4e92\u72ec\u7acb\uff0c\u8fd9\u4e2a\u5047\u8bbe\u5728\u5b9e\u9645\u5e94\u7528\u4e2d\u5f80\u5f80\u662f\u4e0d\u6210\u7acb\u7684\uff0c\u5728\u5c5e\u6027\u4e2a\u6570\u6bd4\u8f83\u591a\u6216\u8005\u5c5e\u6027\u4e4b\u95f4\u76f8\u5173\u6027\u8f83\u5927\u65f6\uff0c\u5206\u7c7b\u6548\u679c\u4e0d\u597d\u3002\u800c\u5728\u5c5e\u6027\u76f8\u5173\u6027\u8f83\u5c0f\u65f6\uff0c\u6734\u7d20\u8d1d\u53f6\u65af\u6027\u80fd\u6700\u4e3a\u826f\u597d\u3002\u5bf9\u4e8e\u8fd9\u4e00\u70b9\uff0c\u6709\u534a\u6734\u7d20\u8d1d\u53f6\u65af\u4e4b\u7c7b\u7684\u7b97\u6cd5\u901a\u8fc7\u8003\u8651\u90e8\u5206\u5173\u8054\u6027\u9002\u5ea6\u6539\u8fdb\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u30002\uff09\u9700\u8981\u77e5\u9053\u5148\u9a8c\u6982\u7387\uff0c\u4e14\u5148\u9a8c\u6982\u7387\u5f88\u591a\u65f6\u5019\u53d6\u51b3\u4e8e\u5047\u8bbe\uff0c\u5047\u8bbe\u7684\u6a21\u578b\u53ef\u4ee5\u6709\u5f88\u591a\u79cd\uff0c\u56e0\u6b64\u5728\u67d0\u4e9b\u65f6\u5019\u4f1a\u7531\u4e8e\u5047\u8bbe\u7684\u5148\u9a8c\u6a21\u578b\u7684\u539f\u56e0\u5bfc\u81f4\u9884\u6d4b\u6548\u679c\u4e0d\u4f73\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u30003\uff09\u7531\u4e8e\u6211\u4eec\u662f\u901a\u8fc7\u5148\u9a8c\u548c\u6570\u636e\u6765\u51b3\u5b9a\u540e\u9a8c\u7684\u6982\u7387\u4ece\u800c\u51b3\u5b9a\u5206\u7c7b\uff0c\u6240\u4ee5\u5206\u7c7b\u51b3\u7b56\u5b58\u5728\u4e00\u5b9a\u7684\u9519\u8bef\u7387\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u30004\uff09\u5bf9\u8f93\u5165\u6570\u636e\u7684\u8868\u8fbe\u5f62\u5f0f\u5f88\u654f\u611f\u3002<\/p>\n\n\n\n<p>\u3000\u3000\u3000\u3000\u4ee5\u4e0a\u5c31\u662f\u6734\u7d20
\u8d1d\u53f6\u65af\u7b97\u6cd5\u7684\u4e00\u4e2a\u603b\u7ed3\uff0c\u5e0c\u671b\u53ef\u4ee5\u5e2e\u5230\u670b\u53cb\u4eec\u3002<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>\nimport warnings\r\nwarnings.filterwarnings(\"ignore\")\r\nimport pandas as pd\r\nimport jieba\r\ndf_news = pd.read_csv('data.csv')\r\ndf_news = df_news.dropna()  #\u5220\u9664\u7f3a\u5931\u503c\r\ndf_news.head()\ndf_news&#91;'content']=df_news&#91;'cus_comment'].str.replace('\\d+', '',regex=True) #\u66ff\u6362\r\ndf_news&#91;'content']=df_news&#91;'cus_comment'].str.replace('&#91;A-z]', '',regex=True) \r\ncontent = df_news.content.values.tolist() #\u5c06\u6bcf\u4e00\u7bc7\u6587\u7ae0\u8f6c\u6362\u6210\u4e00\u4e2alist \r\n'''\r\n#\u4e2d\u6587\u5206\u8bcd\u548c\u53bb\u505c\u7528\u8bcd\r\nimport csv\r\nstopwords=pd.read_csv('stopwords.txt',header=None,quoting = csv.QUOTE_NONE,delimiter=\"\\t\")\r\nstopwords.head()\r\nstopwords =  stopwords&#91;0].tolist()\r\nstopwords.append('\\r\\n')\r\n'''\r\ncontent_S = &#91;]\r\nfor line in content:\r\n    current_segment = jieba.lcut(line) #\u5bf9\u6bcf\u4e00\u7bc7\u6587\u7ae0\u8fdb\u884c\u5206\u8bcd\uff0c\u8fd4\u56de\u7684\u5217\u8868\r\n    #current_segment=&#91;segment for segment in current_segment if segment not in stopwords] #\u53bb\u505c\u7528\u8bcd\u540e\u7684\u5217\u8868\r\n    segments=\" \".join(current_segment) #\u628a\u6570\u7ec4\u4e2d\u7684\u6240\u6709\u5143\u7d20\u653e\u5165\u4e00\u4e2a\u5b57\u7b26\u4e32\r\n    content_S.append(segments) #\u4fdd\u5b58\u5206\u8bcd\u7684\u7ed3\u679c\r\ndf=pd.DataFrame({'contents_clean':content_S,'label':df_news&#91;'stars'].map(lambda x:zhuanhuan(x))})\r\ndf.head()#\u53bb\u6389\u505c\u7528\u8bcd\u540e\n#\u5206\u6570\u636e\u96c6\r\nfrom sklearn.model_selection import train_test_split\r\nx_train, x_test, y_train, y_test = train_test_split(df&#91;'contents_clean'].values, df&#91;'label'].values, random_state=1)\r\n\r\n#\u63d0\u53d6\u7279\u5f81\uff0c\u8bad\u7ec3\u6a21\u578b\uff0c\u8bc4\u4f30\u6a21\u578b\r\nfrom 
sklearn.feature_extraction.text import CountVectorizer\r\nvec = CountVectorizer()\r\ntrain_feature = vec.fit_transform(x_train) #\u8bad\u7ec3\u96c6\u8f6c\u6362\u4e3a\u5411\u91cf\r\ntrain_feature  #&lt;3750x87289 sparse matrix of type '&lt;class 'numpy.int64'>' with 461523 stored elements in Compressed Sparse Row format>\r\ntest_feature = vec.transform(x_test) #\u6d4b\u8bd5\u96c6\u8f6c\u6362\u4e3a\u5411\u91cf\r\ntest_feature.shape #(1250, 87289)\r\n\r\nfrom sklearn.naive_bayes import MultinomialNB #\u8d1d\u53f6\u65af\u6a21\u578b,\u591a\u9879\u5f0f\r\nclassifier = MultinomialNB() \r\nclassifier.fit(train_feature, y_train) #\u8bad\u7ec3\r\nclassifier.score(test_feature, y_test) #\u51c6\u786e\u7387 #0.8248\r\n\r\n#TF-IDF\u6a21\u578b\r\nfrom sklearn.feature_extraction.text import TfidfVectorizer\r\nvectorizer = TfidfVectorizer()\r\nvectorizer.fit(x_train)\r\ntrain_feature=vectorizer.transform(x_train)\r\ntest_feature = vectorizer.transform(x_test)\r\nclassifier = MultinomialNB()\r\nclassifier.fit(train_feature, y_train)\r\nclassifier.score(test_feature, y_test) #0.8264\r\n\r\n#\u7528\u7f51\u683c\u641c\u7d22\u4f18\u5316\r\nfrom sklearn.model_selection import GridSearchCV\r\nparams={'alpha':&#91;0.1,0.2,0.3,0.4]}\r\nclassifier = MultinomialNB()\r\ngrid_search=GridSearchCV(classifier,param_grid=params,cv=10)\r\ngrid_search.fit(train_feature, y_train)\r\ngrid_search.best_params_ #{'alpha': 0.1}\r\ngrid_search.score(test_feature, y_test)#\u6a21\u578b\u5f97\u5206,\u51c6\u786e\u7387 # 0.828\r\n\r\nlabel_mapping = {\"\u6c7d\u8f66\": 1, \"\u8d22\u7ecf\": 2, \"\u79d1\u6280\": 3, \"\u5065\u5eb7\": 4, \"\u4f53\u80b2\":5, \"\u6559\u80b2\": 6,\"\u6587\u5316\": 7,\"\u519b\u4e8b\": 8,\"\u5a31\u4e50\": 9,\"\u65f6\u5c1a\": 0}\r\nfrom sklearn.metrics import classification_report\r\npred_test=grid_search.predict(test_feature) #\u9884\u6d4b\u7ed3\u679c\r\nprint(classification_report(y_test, pred_test))\r\n\r\n#\u5206\u7c7b\u62a5\u544a\r\nfrom sklearn.metrics import 
classification_report\r\npred_test=grid_search.predict(test_feature) #\u9884\u6d4b\u7ed3\u679c\r\nprint(classification_report(y_test, pred_test))\r\n#\u6df7\u6dc6\u77e9\u9635\r\nfrom sklearn import metrics\r\nprint(metrics.confusion_matrix(y_test, pred_test))#\u6df7\u6dc6\u77e9\u9635\uff0c\u4e0d\u8fc7\u611f\u89c90.828\u5206\u7c7b\u7cbe\u5ea6\u8fd8\u4e0d\u662f\u5f88\u9ad8\u54e6\uff0c\u4ee5\u540e\u770b\u770b\u522b\u7684\u7b97\u6cd5\u80fd\u4e0d\u80fd\u66f4\u9ad8\r\n\r\n#https:\/\/zhuanlan.zhihu.com\/p\/395049069<\/code><\/pre>\n","protected":false},"excerpt":{"rendered":"<p>\u4e00\u3001\u722c\u866b \u6574\u4f53\u601d\u8def \u722c\u53d6\u5927\u4f17\u70b9\u8bc4\u5341\u5927\u70ed\u95e8\u7cd6\u6c34\u5e97\u7684\u8bc4\u8bba\uff0c\u722c\u53d6\u7f51\u9875\u540e\u4ecehtml\u9875\u9762\u4e2d\u628a\u9700\u8981\u7684\u5b57\u6bb5\u4fe1\u606f\uff08\u987e&hellip; <a href=\"http:\/\/viplao.com\/index.php\/2022\/10\/16\/%e8%bf%90%e7%bb%b4%e5%b7%a5%e5%85%b7-%e5%a4%a7%e4%bc%97%e7%82%b9%e8%af%84%e8%af%84%e8%ae%ba%e6%96%87%e6%9c%ac%e6%8c%96%e6%8e%98%e5%9f%ba%e7%a1%80%e6%a1%88%e4%be%8b%e5%ae%9e%e8%b7%b5\/\" class=\"more-link read-more\" rel=\"bookmark\">\u7ee7\u7eed\u9605\u8bfb <span class=\"screen-reader-text\">\u8fd0\u7ef4\u5de5\u5177 &#8211; \u5927\u4f17\u70b9\u8bc4\u8bc4\u8bba\u6587\u672c\u6316\u6398\u57fa\u7840\u6848\u4f8b\u5b9e\u8df5<\/span><i class=\"fa 
fa-arrow-right\"><\/i><\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[1],"tags":[],"views":670,"_links":{"self":[{"href":"http:\/\/viplao.com\/index.php\/wp-json\/wp\/v2\/posts\/962"}],"collection":[{"href":"http:\/\/viplao.com\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/viplao.com\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/viplao.com\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/viplao.com\/index.php\/wp-json\/wp\/v2\/comments?post=962"}],"version-history":[{"count":5,"href":"http:\/\/viplao.com\/index.php\/wp-json\/wp\/v2\/posts\/962\/revisions"}],"predecessor-version":[{"id":2298,"href":"http:\/\/viplao.com\/index.php\/wp-json\/wp\/v2\/posts\/962\/revisions\/2298"}],"wp:attachment":[{"href":"http:\/\/viplao.com\/index.php\/wp-json\/wp\/v2\/media?parent=962"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/viplao.com\/index.php\/wp-json\/wp\/v2\/categories?post=962"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/viplao.com\/index.php\/wp-json\/wp\/v2\/tags?post=962"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}