{"id":984,"date":"2025-11-24T21:53:55","date_gmt":"2025-11-24T13:53:55","guid":{"rendered":"http:\/\/www.preluna.xyz\/?p=984"},"modified":"2025-11-27T15:19:10","modified_gmt":"2025-11-27T07:19:10","slug":"getwordpress1","status":"publish","type":"post","link":"http:\/\/www.preluna.xyz\/index.php\/2025\/11\/24\/getwordpress1\/preluna\/technology\/career-skills\/tech-writing\/","title":{"rendered":"WordPress\u6587\u7ae0\u63d0\u53d6\uff081\uff09"},"content":{"rendered":"\n<figure class=\"wp-block-table\"><table class=\"has-fixed-layout\"><thead><tr><th>\u811a\u672c\u540d\u79f0<\/th><th>\u4e3b\u8981\u529f\u80fd<\/th><th>\u5f53\u524d\u72b6\u6001<\/th><th>\u5907\u6ce8<\/th><\/tr><\/thead><tbody><tr><td><strong>check_environment.py<\/strong><\/td><td>\u68c0\u67e5Python\u73af\u5883\u548c\u4f9d\u8d56\u5305<\/td><td>\u5b8c\u6210<\/td><td>\u57fa\u7840\u73af\u5883\u9a8c\u8bc1<\/td><\/tr><tr><td><strong>test_wp_api.py<\/strong><\/td><td>\u6d4b\u8bd5WordPress REST API\u8fde\u63a5<\/td><td>\u5b8c\u6210<\/td><td>\u9a8c\u8bc1API\u53ef\u7528\u6027<\/td><\/tr><tr><td><strong>extract_content.py<\/strong><\/td><td>\u4f7f\u7528corpress\u5305\u63d0\u53d6\u5185\u5bb9<\/td><td>&nbsp;\u9700\u8981\u9a8c\u8bc1<\/td><td>\u6838\u5fc3\u63d0\u53d6\u529f\u80fd<\/td><\/tr><tr><td><strong>validate_extraction.py<\/strong><\/td><td>\u9a8c\u8bc1\u63d0\u53d6\u7ed3\u679c\u7684\u8d28\u91cf<\/td><td>\u9700\u8981\u9a8c\u8bc1<\/td><td>\u8d28\u91cf\u4fdd\u8bc1<\/td><\/tr><tr><td><strong>format_content.py<\/strong><\/td><td>\u6e05\u7406\u548c\u4f18\u5316\u63d0\u53d6\u7684\u5185\u5bb9<\/td><td>&nbsp;\u9700\u8981\u9a8c\u8bc1<\/td><td>\u540e\u5904\u7406<\/td><\/tr><tr><td><strong>run_integration_test.py<\/strong><\/td><td>\u96c6\u6210\u6d4b\u8bd5\u5b8c\u6574\u5de5\u4f5c\u6d41<\/td><td>\u9700\u8981\u9a8c\u8bc1<\/td><td>\u7aef\u5230\u7aef\u6d4b\u8bd5<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<h2 class=\"wp-block-heading\"><strong>\u811a\u672c\u4f9d\u8d56\u5173\u7cfb<\/strong><\/h2>\n\n\n\n<pre 
class=\"wp-block-code\"><code>check_environment.py \n    \u2193 (\u73af\u5883\u5c31\u7eea)\ntest_wp_api.py\n    \u2193 (API\u53ef\u7528)  \nextract_content.py\n    \u2193 (\u5185\u5bb9\u63d0\u53d6\u6210\u529f)\nvalidate_extraction.py\n    \u2193 (\u8d28\u91cf\u9a8c\u8bc1\u901a\u8fc7)\nformat_content.py<\/code><\/pre>\n\n\n\n<h2 class=\"wp-block-heading\">\u5f53\u524d\u4e3b\u8981\u95ee\u9898\u603b\u7ed3<\/h2>\n\n\n\n<ol class=\"wp-block-list\">\n<li><strong>WordPress REST API \u914d\u7f6e\u95ee\u9898<\/strong>\n<ul class=\"wp-block-list\">\n<li>\u7f51\u7ad9\u53ef\u80fd\u6ca1\u6709\u6b63\u786e\u542f\u7528REST API<\/li>\n\n\n\n<li>\u8bbf\u95ee&nbsp;<code>\/wp-json\/<\/code>&nbsp;\u8fd4\u56de\u4e71\u7801\u800c\u4e0d\u662f\u7ed3\u6784\u5316\u7684JSON<\/li>\n\n\n\n<li>\u8fd9\u4f1a\u5f71\u54cd\u6240\u6709\u57fa\u4e8eAPI\u7684\u63d0\u53d6\u811a\u672c<\/li>\n<\/ul>\n<\/li>\n\n\n\n<li><strong>\u811a\u672c\u95f4\u517c\u5bb9\u6027\u95ee\u9898<\/strong>\n<ul class=\"wp-block-list\">\n<li>\u63d0\u53d6\u811a\u672c\u7684\u8f93\u51fa\u683c\u5f0f\u53ef\u80fd\u4e0e\u9a8c\u8bc1\/\u683c\u5f0f\u5316\u811a\u672c\u7684\u9884\u671f\u8f93\u5165\u4e0d\u5339\u914d<\/li>\n\n\n\n<li>\u7f3a\u5c11\u7edf\u4e00\u7684\u9519\u8bef\u5904\u7406\u548c\u6570\u636e\u4f20\u9012\u673a\u5236<\/li>\n<\/ul>\n<\/li>\n\n\n\n<li><strong>\u7f3a\u5c11\u6e10\u8fdb\u5f0f\u6d4b\u8bd5\u65b9\u6cd5<\/strong>\n<ul class=\"wp-block-list\">\n<li>\u6ca1\u6709\u4ece\u7b80\u5355\u5230\u590d\u6742\u7684\u6d4b\u8bd5\u7b56\u7565<\/li>\n\n\n\n<li>\u96be\u4ee5\u5b9a\u4f4d\u5177\u4f53\u54ea\u4e2a\u73af\u8282\u51fa\u73b0\u95ee\u9898<\/li>\n<\/ul>\n<\/li>\n<\/ol>\n\n\n\n<h2 class=\"wp-block-heading\">\u91cd\u65b0\u8bbe\u8ba1\u7684\u7b80\u5316\u5de5\u4f5c\u6d41<\/h2>\n\n\n\n<h3 class=\"wp-block-heading\"><strong>\u9636\u6bb51\uff1a\u57fa\u7840\u9a8c\u8bc1<\/strong><\/h3>\n\n\n\n<pre class=\"wp-block-code\"><code><strong><em>python<\/em><\/strong>\n# \u7b2c\u4e00\u6b65\uff1a\u9a8c\u8bc1\u73af\u5883\npython check_environment.py\n\n# 
\u7b2c\u4e8c\u6b65\uff1a\u624b\u52a8\u9a8c\u8bc1API\n# \u5728\u6d4f\u89c8\u5668\u4e2d\u8bbf\u95ee: https:\/\/\u4f60\u7684\u7f51\u7ad9\/wp-json\/\n# \u786e\u8ba4\u8fd4\u56de\u7684\u662fJSON\u800c\u4e0d\u662f\u4e71\u7801<\/code><\/pre>\n\n\n\n<h3 class=\"wp-block-heading\"><strong>\u9636\u6bb52\uff1a\u5206\u6b65\u6d4b\u8bd5<\/strong><\/h3>\n\n\n\n<pre class=\"wp-block-code\"><code><strong><em>python<\/em><\/strong>\n# \u521b\u5efa\u7b80\u5355\u7684\u6d4b\u8bd5\u811a\u672c test_basic.py\nimport requests\nimport json\n\ndef test_basic():\n    url = \"https:\/\/\u4f60\u7684\u7f51\u7ad9\/wp-json\/wp\/v2\/posts\"\n    try:\n        response = requests.get(url)\n        print(f\"\u72b6\u6001\u7801: {response.status_code}\")\n        print(f\"\u5185\u5bb9\u7c7b\u578b: {response.headers.get('content-type')}\")\n        \n        if response.status_code == 200:\n            data = response.json()\n            print(f\"\u627e\u5230 {len(data)} \u7bc7\u6587\u7ae0\")\n            return True\n        else:\n            print(f\"\u54cd\u5e94\u5185\u5bb9: {response.text&#91;:200]}\")  # \u663e\u793a\u524d200\u5b57\u7b26\n            return False\n    except Exception as e:\n        print(f\"\u9519\u8bef: {e}\")\n        return False\n\nif __name__ == \"__main__\":\n    test_basic()<\/code><\/pre>\n\n\n\n<h2 class=\"wp-block-heading\"><strong>\u9636\u6bb53\uff1a\u7b80\u5316\u63d0\u53d6\u6d41\u7a0b<\/strong><\/h2>\n\n\n\n<p>\u5982\u679cAPI\u5de5\u4f5c\u6b63\u5e38\uff0c\u6211\u4eec\u53ef\u4ee5\u7b80\u5316\u63d0\u53d6\u6d41\u7a0b\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># simple_extract.py - \u7b80\u5316\u7248\u63d0\u53d6\u811a\u672c\nimport requests\nimport os\nimport json\nfrom datetime import datetime\n\ndef simple_extract(site_url, output_dir=\".\/simple_output\"):\n    \"\"\"\u7b80\u5316\u7248\u5185\u5bb9\u63d0\u53d6\"\"\"\n    \n    # \u521b\u5efa\u8f93\u51fa\u76ee\u5f55\n    os.makedirs(output_dir, exist_ok=True)\n    \n    # \u83b7\u53d6\u6587\u7ae0\u5217\u8868\n   
 posts_url = f\"{site_url}\/wp-json\/wp\/v2\/posts\"\n    \n    try:\n        response = requests.get(posts_url)\n        if response.status_code != 200:\n            print(f\"\u274c \u83b7\u53d6\u6587\u7ae0\u5931\u8d25: {response.status_code}\")\n            return False\n        \n        posts = response.json()\n        print(f\"\u2705 \u627e\u5230 {len(posts)} \u7bc7\u6587\u7ae0\")\n        \n        # \u63d0\u53d6\u6bcf\u7bc7\u6587\u7ae0\n        for i, post in enumerate(posts):\n            title = post.get('title', {}).get('rendered', f'\u6587\u7ae0_{i}')\n            content = post.get('content', {}).get('rendered', '')\n            \n            # \u7b80\u5355\u7684\u6587\u4ef6\u540d\n            filename = f\"post_{i+1}.txt\"\n            filepath = os.path.join(output_dir, filename)\n            \n            with open(filepath, 'w', encoding='utf-8') as f:\n                f.write(f\"\u6807\u9898: {title}\\n\\n\")\n                f.write(f\"\u5185\u5bb9: {content}\\n\")\n            \n            print(f\"\ud83d\udcc4 \u4fdd\u5b58: {filename}\")\n        \n        # \u4fdd\u5b58\u5143\u6570\u636e\n        meta = {\n            \"extracted_at\": datetime.now().isoformat(),\n            \"total_posts\": len(posts),\n            \"site_url\": site_url\n        }\n        \n        with open(os.path.join(output_dir, \"metadata.json\"), 'w', encoding='utf-8') as f:\n            json.dump(meta, f, ensure_ascii=False, indent=2)\n        \n        print(f\"\ud83c\udf89 \u63d0\u53d6\u5b8c\u6210! 
\u6587\u4ef6\u4fdd\u5b58\u5728: {output_dir}\")\n        return True\n        \n    except Exception as e:\n        print(f\"\u274c \u63d0\u53d6\u8fc7\u7a0b\u4e2d\u51fa\u9519: {e}\")\n        return False\n\nif __name__ == \"__main__\":\n    import sys\n    if len(sys.argv) != 2:\n        print(\"\u4f7f\u7528\u65b9\u6cd5: python simple_extract.py &lt;\u7f51\u7ad9\u5730\u5740&gt;\")\n        sys.exit(1)\n    \n    simple_extract(sys.argv&#91;1])<\/code><\/pre>\n\n\n\n<h2 class=\"wp-block-heading\"><strong><em>\u6587\u4ef6\u5b58\u6863<\/em><\/strong><\/h2>\n\n\n\n<h3 class=\"wp-block-heading\"><strong>\u73af\u5883\u68c0\u6d4b\u811a\u672c<\/strong>&nbsp;<code>check_environment.py<\/code><\/h3>\n\n\n\n<p>\u8fd9\u4e2a\u811a\u672c\u68c0\u67e5\u8fd0\u884c\u73af\u5883\u662f\u5426\u6ee1\u8db3\u8981\u6c42<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># check_environment.py\nimport sys\nimport subprocess\nimport pkg_resources\n\ndef check_python_version():\n    \"\"\"\u68c0\u67e5Python\u7248\u672c\"\"\"\n    print(\"\ud83d\udd0d \u68c0\u67e5Python\u7248\u672c...\")\n    version = sys.version_info\n    if version.major &gt;= 3 and version.minor &gt;= 6:\n        print(f\"\u2705 Python\u7248\u672c: {sys.version}\")\n        return True\n    else:\n        print(f\"\u274c Python\u7248\u672c\u8fc7\u4f4e: {sys.version}\")\n        print(\"   \u9700\u8981 Python 3.6 \u6216\u66f4\u9ad8\u7248\u672c\")\n        return False\n\ndef check_dependencies():\n    \"\"\"\u68c0\u67e5\u5fc5\u8981\u7684\u4f9d\u8d56\u5305\"\"\"\n    print(\"\\n\ud83d\udd0d \u68c0\u67e5\u4f9d\u8d56\u5305...\")\n    required_packages = &#91;'requests', 'corpress', 'beautifulsoup4']\n    missing_packages = &#91;]\n    \n    for package in required_packages:\n        try:\n            pkg_resources.require(package)\n            print(f\"\u2705 {package}\")\n        except:\n            missing_packages.append(package)\n            print(f\"\u274c {package}\")\n    \n    if missing_packages:\n        
print(f\"\\n\ud83d\udce6 \u9700\u8981\u5b89\u88c5\u7f3a\u5931\u7684\u5305:\")\n        print(f\"pip install {' '.join(missing_packages)}\")\n        return False\n    else:\n        print(\"\u2705 \u6240\u6709\u4f9d\u8d56\u5305\u5df2\u5b89\u88c5\")\n        return True\n\ndef install_dependencies():\n    \"\"\"\u81ea\u52a8\u5b89\u88c5\u7f3a\u5931\u7684\u4f9d\u8d56\u5305\"\"\"\n    print(\"\\n\ud83d\udce6 \u6b63\u5728\u5b89\u88c5\u7f3a\u5931\u7684\u4f9d\u8d56\u5305...\")\n    try:\n        subprocess.check_call(&#91;sys.executable, \"-m\", \"pip\", \"install\", \"requests\", \"corpress\", \"beautifulsoup4\"])\n        print(\"\u2705 \u4f9d\u8d56\u5305\u5b89\u88c5\u5b8c\u6210\")\n        return True\n    except subprocess.CalledProcessError as e:\n        print(f\"\u274c \u5b89\u88c5\u5931\u8d25: {e}\")\n        return False\n\ndef main():\n    print(\"=\" * 50)\n    print(\"\ud83d\udee0\ufe0f  WordPress\u5185\u5bb9\u63d0\u53d6 - \u73af\u5883\u68c0\u6d4b\")\n    print(\"=\" * 50)\n    \n    # \u68c0\u67e5Python\u7248\u672c\n    if not check_python_version():\n        sys.exit(1)\n    \n    # \u68c0\u67e5\u4f9d\u8d56\u5305\n    if not check_dependencies():\n        if input(\"\\n\u662f\u5426\u81ea\u52a8\u5b89\u88c5\u7f3a\u5931\u7684\u4f9d\u8d56\u5305? 
(y\/n): \").lower() == 'y':\n            if not install_dependencies():\n                sys.exit(1)\n        else:\n            print(\"\u8bf7\u624b\u52a8\u5b89\u88c5\u7f3a\u5931\u7684\u4f9d\u8d56\u5305\u540e\u91cd\u65b0\u8fd0\u884c\")\n            sys.exit(1)\n    \n    print(\"\\n\ud83c\udf89 \u73af\u5883\u68c0\u6d4b\u901a\u8fc7\uff01\u53ef\u4ee5\u8fd0\u884c\u540e\u7eed\u811a\u672c\")\n    return True\n\nif __name__ == \"__main__\":\n    main()<\/code><\/pre>\n\n\n\n<h3 class=\"wp-block-heading\"><strong>API\u6d4b\u8bd5\u811a\u672c<\/strong>&nbsp;<code>test_wp_api.py<\/code><\/h3>\n\n\n\n<p>\u8fd9\u4e2a\u811a\u672c\u4e13\u95e8\u6d4b\u8bd5WordPress REST API\u7684\u8fde\u63a5\u72b6\u6001\u548c\u529f\u80fd<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># test_wp_api.py\nimport requests\nimport json\nimport sys\nfrom urllib.parse import urljoin\n\ndef test_basic_api(site_url):\n    \"\"\"\u6d4b\u8bd5\u57fa\u672c\u7684API\u8fde\u63a5\"\"\"\n    print(f\"\\n\ud83d\udd17 \u6d4b\u8bd5API\u8fde\u63a5: {site_url}\")\n    \n    # \u6d4b\u8bd5\u4e3b\u8981API\u7aef\u70b9\n    api_url = urljoin(site_url, \"\/wp-json\/\")\n    try:\n        response = requests.get(api_url, timeout=10)\n        if response.status_code == 200:\n            print(\"\u2705 REST API \u8fde\u63a5\u6210\u529f!\")\n            return True\n        else:\n            print(f\"\u274c API\u8fd4\u56de\u72b6\u6001\u7801: {response.status_code}\")\n            return False\n    except requests.exceptions.RequestException as e:\n        print(f\"\u274c \u8fde\u63a5\u5931\u8d25: {e}\")\n        return False\n\ndef test_posts_api(site_url):\n    \"\"\"\u6d4b\u8bd5\u6587\u7ae0API\u7aef\u70b9\"\"\"\n    print(f\"\\n\ud83d\udcdd \u6d4b\u8bd5\u6587\u7ae0API...\")\n    \n    posts_url = urljoin(site_url, \"\/wp-json\/wp\/v2\/posts\")\n    try:\n        response = requests.get(posts_url, timeout=10)\n        if response.status_code == 200:\n            posts_data = response.json()\n            print(f\"\u2705 
\u6587\u7ae0API\u6d4b\u8bd5\u6210\u529f!\")\n            print(f\"   \u627e\u5230 {len(posts_data)} \u7bc7\u6587\u7ae0\")\n            \n            if posts_data:\n                # \u663e\u793a\u524d3\u7bc7\u6587\u7ae0\u7684\u6807\u9898\n                for i, post in enumerate(posts_data&#91;:3]):\n                    title = post.get('title', {}).get('rendered', '\u65e0\u6807\u9898')\n                    # \u6e05\u7406HTML\u6807\u7b7e\n                    import re\n                    title = re.sub(r'&lt;&#91;^&gt;]+&gt;', '', title)\n                    print(f\"   {i+1}. {title}\")\n            return True\n        else:\n            print(f\"\u274c \u6587\u7ae0API\u8fd4\u56de\u72b6\u6001\u7801: {response.status_code}\")\n            return False\n    except requests.exceptions.RequestException as e:\n        print(f\"\u274c \u6587\u7ae0API\u6d4b\u8bd5\u5931\u8d25: {e}\")\n        return False\n\ndef test_api_details(site_url):\n    \"\"\"\u6d4b\u8bd5API\u8be6\u7ec6\u4fe1\u606f\"\"\"\n    print(f\"\\n\ud83d\udcca \u83b7\u53d6API\u8be6\u7ec6\u4fe1\u606f...\")\n    \n    api_url = urljoin(site_url, \"\/wp-json\/\")\n    try:\n        response = requests.get(api_url, timeout=10)\n        data = response.json()\n        \n        print(\"\u2705 API\u4fe1\u606f:\")\n        print(f\"   \u540d\u79f0: {data.get('name', '\u672a\u77e5')}\")\n        print(f\"   \u63cf\u8ff0: {data.get('description', '\u672a\u77e5')}\")\n        print(f\"   \u4e3b\u9875: {data.get('home', '\u672a\u77e5')}\")\n        \n        # \u68c0\u67e5\u53ef\u7528\u7684\u7aef\u70b9\n        routes = data.get('routes', {})\n        print(f\"   \u53ef\u7528\u7aef\u70b9: {len(routes)} \u4e2a\")\n        \n        # \u663e\u793a\u91cd\u8981\u7684\u7aef\u70b9\n        important_routes = &#91;route for route in routes.keys() if any(keyword in route for keyword in &#91;'posts', 'pages', 'media'])]\n        for route in important_routes&#91;:5]:  # \u663e\u793a\u524d5\u4e2a\u91cd\u8981\u7aef\u70b9\n    
        print(f\"     - {route}\")\n            \n        return True\n    except Exception as e:\n        print(f\"\u274c \u83b7\u53d6API\u4fe1\u606f\u5931\u8d25: {e}\")\n        return False\n\ndef main():\n    if len(sys.argv) != 2:\n        print(\"\u4f7f\u7528\u65b9\u6cd5: python test_wp_api.py &lt;\u7f51\u7ad9\u5730\u5740&gt;\")\n        print(\"\u793a\u4f8b: python test_wp_api.py https:\/\/example.com\")\n        sys.exit(1)\n    \n    site_url = sys.argv&#91;1]\n    \n    print(\"=\" * 50)\n    print(\"\ud83e\uddea WordPress REST API \u6d4b\u8bd5\u5de5\u5177\")\n    print(\"=\" * 50)\n    \n    # \u8fd0\u884c\u6d4b\u8bd5\n    tests_passed = 0\n    tests_total = 3\n    \n    if test_basic_api(site_url):\n        tests_passed += 1\n    \n    if test_posts_api(site_url):\n        tests_passed += 1\n    \n    if test_api_details(site_url):\n        tests_passed += 1\n    \n    # \u6d4b\u8bd5\u603b\u7ed3\n    print(\"\\n\" + \"=\" * 50)\n    print(\"\ud83d\udccb \u6d4b\u8bd5\u603b\u7ed3\")\n    print(\"=\" * 50)\n    print(f\"\u901a\u8fc7\u6d4b\u8bd5: {tests_passed}\/{tests_total}\")\n    \n    if tests_passed == tests_total:\n        print(\"\ud83c\udf89 \u6240\u6709\u6d4b\u8bd5\u901a\u8fc7\uff01\u53ef\u4ee5\u5f00\u59cb\u63d0\u53d6\u5185\u5bb9\")\n        return True\n    else:\n        print(\"\u274c \u90e8\u5206\u6d4b\u8bd5\u5931\u8d25\uff0c\u8bf7\u68c0\u67e5WordPress REST API\u914d\u7f6e\")\n        return False\n\nif __name__ == \"__main__\":\n    main()<\/code><\/pre>\n\n\n\n<h3 class=\"wp-block-heading\"><strong>\u5185\u5bb9\u63d0\u53d6\u811a\u672c<\/strong>&nbsp;<code>extract_content.py<\/code><\/h3>\n\n\n\n<p>\u8fd9\u4e2a\u811a\u672c\u4f7f\u7528corpress\u5305\u6765\u63d0\u53d6\u5185\u5bb9<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># extract_content.py\nimport sys\nimport os\nimport json\nfrom datetime import datetime\nfrom corpress.core import corpress\n\ndef create_output_structure(base_path):\n    
\"\"\"\u521b\u5efa\u8f93\u51fa\u76ee\u5f55\u7ed3\u6784\"\"\"\n    timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n    output_dir = os.path.join(base_path, f\"wordpress_export_{timestamp}\")\n    \n    os.makedirs(output_dir, exist_ok=True)\n    os.makedirs(os.path.join(output_dir, \"raw_data\"), exist_ok=True)\n    os.makedirs(os.path.join(output_dir, \"formatted\"), exist_ok=True)\n    \n    print(f\"\ud83d\udcc1 \u521b\u5efa\u8f93\u51fa\u76ee\u5f55: {output_dir}\")\n    return output_dir\n\ndef extract_with_corpress(site_url, output_dir):\n    \"\"\"\u4f7f\u7528corpress\u63d0\u53d6\u5185\u5bb9\"\"\"\n    print(f\"\\n\ud83d\ude80 \u5f00\u59cb\u63d0\u53d6\u5185\u5bb9...\")\n    print(f\"   \u6765\u6e90: {site_url}\")\n    print(f\"   \u76ee\u6807: {output_dir}\")\n    \n    try:\n        # \u63d0\u53d6\u5185\u5bb9\u5230txt\u683c\u5f0f\n        print(\"\ud83d\udcc4 \u63d0\u53d6\u6587\u672c\u683c\u5f0f\u5185\u5bb9...\")\n        corpress(site_url, save_path=output_dir, corpus_format='txt')\n        \n        # \u63d0\u53d6\u5185\u5bb9\u5230csv\u683c\u5f0f\uff08\u5305\u542b\u5143\u6570\u636e\uff09\n        print(\"\ud83d\udcca \u63d0\u53d6CSV\u683c\u5f0f\u5185\u5bb9\uff08\u5305\u542b\u5143\u6570\u636e\uff09...\")\n        corpress(site_url, save_path=output_dir, corpus_format='csv')\n        \n        print(\"\u2705 \u5185\u5bb9\u63d0\u53d6\u5b8c\u6210!\")\n        return True\n        \n    except Exception as e:\n        print(f\"\u274c \u63d0\u53d6\u5931\u8d25: {e}\")\n        return False\n\ndef analyze_extracted_content(output_dir):\n    \"\"\"\u5206\u6790\u63d0\u53d6\u7684\u5185\u5bb9\"\"\"\n    print(f\"\\n\ud83d\udcca \u5206\u6790\u63d0\u53d6\u7684\u5185\u5bb9...\")\n    \n    # \u7edf\u8ba1\u6587\u4ef6\n    txt_files = &#91;f for f in os.listdir(output_dir) if f.endswith('.txt')]\n    csv_files = &#91;f for f in os.listdir(output_dir) if f.endswith('.csv')]\n    \n    print(f\"   \u6587\u672c\u6587\u4ef6: {len(txt_files)} \u4e2a\")\n    print(f\"   
CSV\u6587\u4ef6: {len(csv_files)} \u4e2a\")\n    \n    # \u663e\u793a\u90e8\u5206\u63d0\u53d6\u7684\u6587\u4ef6\n    if txt_files:\n        print(f\"\\n   \u793a\u4f8b\u6587\u672c\u6587\u4ef6:\")\n        for file in txt_files&#91;:3]:  # \u663e\u793a\u524d3\u4e2a\u6587\u4ef6\n            file_path = os.path.join(output_dir, file)\n            with open(file_path, 'r', encoding='utf-8') as f:\n                content = f.read()\n                preview = content&#91;:100] + \"...\" if len(content) &gt; 100 else content\n                print(f\"     - {file}: {preview}\")\n    \n    return len(txt_files) &gt; 0\n\ndef main():\n    if len(sys.argv) != 2:\n        print(\"\u4f7f\u7528\u65b9\u6cd5: python extract_content.py &lt;\u7f51\u7ad9\u5730\u5740&gt;\")\n        print(\"\u793a\u4f8b: python extract_content.py https:\/\/example.com\")\n        sys.exit(1)\n    \n    site_url = sys.argv&#91;1]\n    base_output_path = \".\/wordpress_exports\"\n    \n    print(\"=\" * 50)\n    print(\"\ud83d\udce5 WordPress\u5185\u5bb9\u63d0\u53d6\u5de5\u5177\")\n    print(\"=\" * 50)\n    \n    # \u521b\u5efa\u8f93\u51fa\u76ee\u5f55\n    output_dir = create_output_structure(base_output_path)\n    \n    # \u63d0\u53d6\u5185\u5bb9\n    if extract_with_corpress(site_url, output_dir):\n        # \u5206\u6790\u7ed3\u679c\n        if analyze_extracted_content(output_dir):\n            print(f\"\\n\ud83c\udf89 \u5185\u5bb9\u63d0\u53d6\u6210\u529f\u5b8c\u6210!\")\n            print(f\"   \u8f93\u51fa\u76ee\u5f55: {output_dir}\")\n            return True\n        else:\n            print(\"\u274c \u5185\u5bb9\u63d0\u53d6\u4f46\u672a\u627e\u5230\u6709\u6548\u6587\u4ef6\")\n            return False\n    else:\n        print(\"\u274c \u5185\u5bb9\u63d0\u53d6\u5931\u8d25\")\n        return False\n\nif __name__ == \"__main__\":\n    main()<\/code><\/pre>\n\n\n\n<h3 
class=\"wp-block-heading\"><strong>\u5185\u5bb9\u683c\u5f0f\u5316\u811a\u672c<\/strong>&nbsp;<code>format_content.py<\/code><\/h3>\n\n\n\n<p>\u8fd9\u4e2a\u811a\u672c\u5bf9\u63d0\u53d6\u7684\u5185\u5bb9\u8fdb\u884c\u8fdb\u4e00\u6b65\u5904\u7406\u548c\u683c\u5f0f\u5316<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># format_content.py\nimport os\nimport json\nimport csv\nimport re\nfrom pathlib import Path\n\ndef clean_html_tags(text):\n    \"\"\"\u6e05\u7406HTML\u6807\u7b7e\"\"\"\n    if not text:\n        return \"\"\n    # \u79fb\u9664HTML\u6807\u7b7e\n    clean = re.compile('&lt;.*?&gt;')\n    return re.sub(clean, '', text)\n\ndef format_for_ai(content):\n    \"\"\"\u683c\u5f0f\u5316\u5185\u5bb9\uff0c\u4f7f\u5176\u66f4\u9002\u5408AI\u5904\u7406\"\"\"\n    # \u6e05\u7406HTML\n    clean_content = clean_html_tags(content)\n    \n    # \u79fb\u9664\u591a\u4f59\u7684\u7a7a\u767d\u5b57\u7b26\n    clean_content = re.sub(r'\\n\\s*\\n', '\\n\\n', clean_content)\n    clean_content = re.sub(r'&#91; \\t]+', ' ', clean_content)\n    \n    # \u786e\u4fdd\u6bcf\u6bb5\u4ee5\u6362\u884c\u7b26\u7ed3\u675f\n    clean_content = clean_content.strip()\n    \n    return clean_content\n\ndef process_text_files(input_dir, output_dir):\n    \"\"\"\u5904\u7406\u6587\u672c\u6587\u4ef6\"\"\"\n    print(\"\ud83d\udcc4 \u5904\u7406\u6587\u672c\u6587\u4ef6...\")\n    \n    txt_files = &#91;f for f in os.listdir(input_dir) if f.endswith('.txt')]\n    processed_count = 0\n    \n    for txt_file in txt_files:\n        input_path = os.path.join(input_dir, txt_file)\n        output_path = os.path.join(output_dir, f\"cleaned_{txt_file}\")\n        \n        try:\n            with open(input_path, 'r', encoding='utf-8') as f:\n                content = f.read()\n            \n            # \u683c\u5f0f\u5316\u5185\u5bb9\n            formatted_content = format_for_ai(content)\n            \n            # \u4fdd\u5b58\u683c\u5f0f\u5316\u540e\u7684\u5185\u5bb9\n            with open(output_path, 'w', 
encoding='utf-8') as f:\n                f.write(formatted_content)\n            \n            processed_count += 1\n            \n        except Exception as e:\n            print(f\"\u274c \u5904\u7406\u6587\u4ef6 {txt_file} \u65f6\u51fa\u9519: {e}\")\n    \n    print(f\"\u2705 \u5df2\u5904\u7406 {processed_count}\/{len(txt_files)} \u4e2a\u6587\u672c\u6587\u4ef6\")\n    return processed_count\n\ndef process_csv_files(input_dir, output_dir):\n    \"\"\"\u5904\u7406CSV\u6587\u4ef6\"\"\"\n    print(\"\ud83d\udcca \u5904\u7406CSV\u6587\u4ef6...\")\n    \n    csv_files = &#91;f for f in os.listdir(input_dir) if f.endswith('.csv')]\n    \n    for csv_file in csv_files:\n        input_path = os.path.join(input_dir, csv_file)\n        output_path = os.path.join(output_dir, f\"summary_{csv_file}\")\n        \n        try:\n            with open(input_path, 'r', encoding='utf-8') as f:\n                reader = csv.DictReader(f)\n                rows = list(reader)\n            \n            print(f\"   {csv_file}: {len(rows)} \u6761\u8bb0\u5f55\")\n            \n            # \u521b\u5efa\u7b80\u5355\u7684\u7edf\u8ba1\u4fe1\u606f\n            stats = {\n                \"total_records\": len(rows),\n                \"columns\": list(rows&#91;0].keys()) if rows else &#91;],\n                \"sample_record\": rows&#91;0] if rows else {}\n            }\n            \n            # \u4fdd\u5b58\u7edf\u8ba1\u4fe1\u606f\n            stats_path = os.path.join(output_dir, f\"stats_{os.path.splitext(csv_file)&#91;0]}.json\")\n            with open(stats_path, 'w', encoding='utf-8') as f:\n                json.dump(stats, f, ensure_ascii=False, indent=2)\n            \n        except Exception as e:\n            print(f\"\u274c \u5904\u7406CSV\u6587\u4ef6 {csv_file} \u65f6\u51fa\u9519: {e}\")\n    \n    return len(csv_files)\n\ndef create_content_summary(input_dir, output_dir):\n    \"\"\"\u521b\u5efa\u5185\u5bb9\u6458\u8981\"\"\"\n    print(\"\\n\ud83d\udccb 
\u521b\u5efa\u5185\u5bb9\u6458\u8981...\")\n    \n    summary = {\n        \"extraction_date\": str(Path(input_dir).name),\n        \"files\": {},\n        \"total_content_size\": 0\n    }\n    \n    # \u7edf\u8ba1\u6240\u6709\u6587\u4ef6\n    for root, dirs, files in os.walk(input_dir):\n        for file in files:\n            if file.endswith(('.txt', '.csv')):\n                file_path = os.path.join(root, file)\n                file_size = os.path.getsize(file_path)\n                \n                rel_path = os.path.relpath(file_path, input_dir)\n                summary&#91;\"files\"]&#91;rel_path] = {\n                    \"size_bytes\": file_size,\n                    \"size_kb\": round(file_size \/ 1024, 2)\n                }\n                summary&#91;\"total_content_size\"] += file_size\n    \n    # \u4fdd\u5b58\u6458\u8981\n    summary_path = os.path.join(output_dir, \"extraction_summary.json\")\n    with open(summary_path, 'w', encoding='utf-8') as f:\n        json.dump(summary, f, ensure_ascii=False, indent=2)\n    \n    print(f\"\u2705 \u5185\u5bb9\u6458\u8981\u5df2\u4fdd\u5b58\")\n    print(f\"   \u603b\u5185\u5bb9\u5927\u5c0f: {round(summary&#91;'total_content_size'] \/ 1024, 2)} KB\")\n    print(f\"   \u6587\u4ef6\u6570\u91cf: {len(summary&#91;'files'])}\")\n    \n    return summary\n\ndef main():\n    if len(sys.argv) != 2:\n        print(\"\u4f7f\u7528\u65b9\u6cd5: python format_content.py &lt;\u63d0\u53d6\u5185\u5bb9\u76ee\u5f55&gt;\")\n        print(\"\u793a\u4f8b: python format_content.py .\/wordpress_exports\/wordpress_export_20240520_143022\")\n        sys.exit(1)\n    \n    input_dir = sys.argv&#91;1]\n    \n    if not os.path.exists(input_dir):\n        print(f\"\u274c \u76ee\u5f55\u4e0d\u5b58\u5728: {input_dir}\")\n        sys.exit(1)\n    \n    print(\"=\" * 50)\n    print(\"\u2728 \u5185\u5bb9\u683c\u5f0f\u5316\u5de5\u5177\")\n    print(\"=\" * 50)\n    \n    # \u521b\u5efa\u683c\u5f0f\u5316\u8f93\u51fa\u76ee\u5f55\n    
formatted_dir = os.path.join(input_dir, \"formatted\")\n    os.makedirs(formatted_dir, exist_ok=True)\n    \n    # \u5904\u7406\u5404\u79cd\u6587\u4ef6\n    txt_count = process_text_files(input_dir, formatted_dir)\n    csv_count = process_csv_files(input_dir, formatted_dir)\n    \n    # \u521b\u5efa\u6458\u8981\n    summary = create_content_summary(input_dir, formatted_dir)\n    \n    print(f\"\\n\ud83c\udf89 \u5185\u5bb9\u683c\u5f0f\u5316\u5b8c\u6210!\")\n    print(f\"   \u683c\u5f0f\u5316\u6587\u4ef6\u4fdd\u5b58\u5728: {formatted_dir}\")\n    print(f\"   \u5904\u7406\u6587\u672c\u6587\u4ef6: {txt_count} \u4e2a\")\n    print(f\"   \u5904\u7406CSV\u6587\u4ef6: {csv_count} \u4e2a\")\n\nif __name__ == \"__main__\":\n    main()<\/code><\/pre>\n\n\n\n<h3 class=\"wp-block-heading\"><strong>\u63d0\u53d6\u7ed3\u679c\u9a8c\u8bc1\u811a\u672c<\/strong>&nbsp;<code>validate_extraction.py<\/code><\/h3>\n\n\n\n<p>\u8fd9\u4e2a\u811a\u672c\u4e13\u95e8\u9a8c\u8bc1\u63d0\u53d6\u7ed3\u679c\u7684\u8d28\u91cf\u548c\u5b8c\u6574\u6027\u3002<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># validate_extraction.py\nimport os\nimport json\nimport sys\nfrom pathlib import Path\n\ndef validate_directory_structure(extract_dir):\n    \"\"\"\u9a8c\u8bc1\u76ee\u5f55\u7ed3\u6784\u662f\u5426\u7b26\u5408\u9884\u671f\"\"\"\n    print(\"\ud83d\udcc1 \u9a8c\u8bc1\u76ee\u5f55\u7ed3\u6784...\")\n    \n    expected_dirs = &#91;'raw_data', 'formatted']\n    expected_files = &#91;'extraction_summary.json']\n    \n    issues = &#91;]\n    \n    # \u68c0\u67e5\u57fa\u672c\u76ee\u5f55\u662f\u5426\u5b58\u5728\n    if not os.path.exists(extract_dir):\n        issues.append(f\"\u274c \u63d0\u53d6\u76ee\u5f55\u4e0d\u5b58\u5728: {extract_dir}\")\n        return False, issues\n    \n    # \u68c0\u67e5\u662f\u5426\u6709\u5fc5\u8981\u7684\u6587\u4ef6\n    has_txt_files = any(f.endswith('.txt') for f in os.listdir(extract_dir))\n    has_csv_files = any(f.endswith('.csv') for f in os.listdir(extract_dir))\n    \n    if 
not has_txt_files and not has_csv_files:\n        issues.append(\"\u274c \u672a\u627e\u5230\u4efb\u4f55\u63d0\u53d6\u7684\u6587\u672c\u6216CSV\u6587\u4ef6\")\n    \n    # \u68c0\u67e5\u6587\u4ef6\u7f16\u7801\u548c\u53ef\u8bfb\u6027\n    file_issues = check_file_integrity(extract_dir)\n    issues.extend(file_issues)\n    \n    if not issues:\n        print(\"\u2705 \u76ee\u5f55\u7ed3\u6784\u9a8c\u8bc1\u901a\u8fc7\")\n        return True, issues\n    else:\n        return False, issues\n\ndef check_file_integrity(extract_dir):\n    \"\"\"\u68c0\u67e5\u6587\u4ef6\u5b8c\u6574\u6027\u548c\u53ef\u8bfb\u6027\"\"\"\n    print(\"\ud83d\udd0d \u68c0\u67e5\u6587\u4ef6\u5b8c\u6574\u6027...\")\n    \n    issues = &#91;]\n    \n    for file in os.listdir(extract_dir):\n        if file.endswith(('.txt', '.csv')):\n            file_path = os.path.join(extract_dir, file)\n            \n            try:\n                # \u5c1d\u8bd5\u8bfb\u53d6\u6587\u4ef6\n                with open(file_path, 'r', encoding='utf-8') as f:\n                    content = f.read()\n                \n                # \u68c0\u67e5\u6587\u4ef6\u5927\u5c0f\n                file_size = len(content)\n                if file_size == 0:\n                    issues.append(f\"\u274c \u6587\u4ef6\u4e3a\u7a7a: {file}\")\n                elif file_size &lt; 10:  # \u592a\u5c0f\u53ef\u80fd\u6709\u95ee\u9898\n                    issues.append(f\"\u26a0\ufe0f  \u6587\u4ef6\u8fc7\u5c0f({file_size}\u5b57\u8282): {file}\")\n                \n                # \u68c0\u67e5\u5185\u5bb9\u8d28\u91cf\n                if file.endswith('.txt'):\n                    content_issues = check_text_content(content, file)\n                    issues.extend(content_issues)\n                    \n            except UnicodeDecodeError:\n                issues.append(f\"\u274c \u6587\u4ef6\u7f16\u7801\u95ee\u9898: {file}\")\n            except Exception as e:\n                issues.append(f\"\u274c \u8bfb\u53d6\u6587\u4ef6\u5931\u8d25 
{file}: {e}\")\n    \n    return issues\n\ndef check_text_content(content, filename):\n    \"\"\"\u68c0\u67e5\u6587\u672c\u5185\u5bb9\u8d28\u91cf\"\"\"\n    issues = &#91;]\n    \n    # \u68c0\u67e5\u662f\u5426\u5305\u542b\u6709\u7528\u7684\u5185\u5bb9\uff08\u4e0d\u662f\u53ea\u6709HTML\u6807\u7b7e\u6216\u7a7a\u767d\uff09\n    text_only = remove_html_tags(content).strip()\n    \n    if len(text_only) &lt; 50 and len(content) &gt; 100:\n        issues.append(f\"\u26a0\ufe0f  \u53ef\u80fd\u5305\u542b\u5927\u91cfHTML\u6807\u7b7e: {filename}\")\n    \n    # \u68c0\u67e5\u662f\u5426\u6709\u660e\u663e\u7684\u63d0\u53d6\u95ee\u9898\n    error_indicators = &#91;\n        \"undefined\",\n        \"error\",\n        \"exception\",\n        \"cannot read property\",\n        \"wp-json\",\n        \"rest_api\"\n    ]\n    \n    for indicator in error_indicators:\n        if indicator.lower() in content.lower():\n            issues.append(f\"\u26a0\ufe0f  \u53ef\u80fd\u5305\u542b\u9519\u8bef\u4fe1\u606f({indicator}): {filename}\")\n    \n    return issues\n\ndef remove_html_tags(text):\n    \"\"\"\u79fb\u9664HTML\u6807\u7b7e\"\"\"\n    import re\n    clean = re.compile('&lt;.*?&gt;')\n    return re.sub(clean, '', text)\n\ndef validate_content_structure(extract_dir):\n    \"\"\"\u9a8c\u8bc1\u5185\u5bb9\u7ed3\u6784\"\"\"\n    print(\"\ud83d\udcca \u9a8c\u8bc1\u5185\u5bb9\u7ed3\u6784...\")\n    \n    issues = &#91;]\n    \n    # \u7edf\u8ba1\u6587\u4ef6\u7c7b\u578b\u548c\u6570\u91cf\n    file_types = {}\n    total_files = 0\n    \n    for file in os.listdir(extract_dir):\n        ext = Path(file).suffix.lower()\n        file_types&#91;ext] = file_types.get(ext, 0) + 1\n        total_files += 1\n    \n    print(f\"   \u6587\u4ef6\u7edf\u8ba1: {file_types}\")\n    \n    if total_files == 0:\n        issues.append(\"\u274c \u6ca1\u6709\u627e\u5230\u4efb\u4f55\u63d0\u53d6\u7684\u6587\u4ef6\")\n    \n    # \u68c0\u67e5\u662f\u5426\u6709\u8db3\u591f\u7684\u5185\u5bb9\n    txt_count = 
file_types.get('.txt', 0)\n    csv_count = file_types.get('.csv', 0)\n    \n    if txt_count == 0 and csv_count == 0:\n        issues.append(\"\u274c \u6ca1\u6709\u627e\u5230\u6587\u672c\u6216CSV\u6587\u4ef6\")\n    \n    return issues\n\ndef generate_validation_report(extract_dir, is_valid, issues):\n    \"\"\"\u751f\u6210\u9a8c\u8bc1\u62a5\u544a\"\"\"\n    report = {\n        \"validation_timestamp\": str(Path(extract_dir).name),\n        \"extract_directory\": extract_dir,\n        \"is_valid\": is_valid,\n        \"issues_found\": len(issues),\n        \"issues\": issues,\n        \"recommendation\": \"\"\n    }\n    \n    if is_valid:\n        report&#91;\"recommendation\"] = \"\u2705 \u9a8c\u8bc1\u901a\u8fc7\uff0c\u53ef\u4ee5\u8fd0\u884c\u683c\u5f0f\u5316\u811a\u672c\"\n    else:\n        report&#91;\"recommendation\"] = \"\u274c \u8bf7\u5148\u89e3\u51b3\u4e0a\u8ff0\u95ee\u9898\u518d\u8fd0\u884c\u683c\u5f0f\u5316\u811a\u672c\"\n    \n    # \u4fdd\u5b58\u62a5\u544a\n    report_path = os.path.join(extract_dir, \"validation_report.json\")\n    with open(report_path, 'w', encoding='utf-8') as f:\n        json.dump(report, f, ensure_ascii=False, indent=2)\n    \n    return report\n\ndef main():\n    if len(sys.argv) != 2:\n        print(\"\u4f7f\u7528\u65b9\u6cd5: python validate_extraction.py &lt;\u63d0\u53d6\u5185\u5bb9\u76ee\u5f55&gt;\")\n        print(\"\u793a\u4f8b: python validate_extraction.py .\/wordpress_exports\/wordpress_export_20240520_143022\")\n        sys.exit(1)\n    \n    extract_dir = sys.argv&#91;1]\n    \n    print(\"=\" * 50)\n    print(\"\ud83d\udd0d \u63d0\u53d6\u7ed3\u679c\u9a8c\u8bc1\u5de5\u5177\")\n    print(\"=\" * 50)\n    \n    # \u8fd0\u884c\u9a8c\u8bc1\n    is_valid, structure_issues = validate_directory_structure(extract_dir)\n    content_issues = validate_content_structure(extract_dir)\n    \n    all_issues = structure_issues + content_issues\n    is_overall_valid = is_valid and len(all_issues) == 0\n    \n    # 
\u751f\u6210\u62a5\u544a\n    report = generate_validation_report(extract_dir, is_overall_valid, all_issues)\n    \n    # \u663e\u793a\u7ed3\u679c\n    print(\"\\n\" + \"=\" * 50)\n    print(\"\ud83d\udccb \u9a8c\u8bc1\u7ed3\u679c\")\n    print(\"=\" * 50)\n    \n    if is_overall_valid:\n        print(\"\ud83c\udf89 \u6240\u6709\u9a8c\u8bc1\u901a\u8fc7\uff01\")\n    else:\n        print(f\"\u274c \u53d1\u73b0 {len(all_issues)} \u4e2a\u95ee\u9898:\")\n        for issue in all_issues:\n            print(f\"   {issue}\")\n    \n    print(f\"\\n\ud83d\udca1 \u5efa\u8bae: {report&#91;'recommendation']}\")\n    print(f\"\ud83d\udcc4 \u8be6\u7ec6\u62a5\u544a\u5df2\u4fdd\u5b58: {os.path.join(extract_dir, 'validation_report.json')}\")\n    \n    return is_overall_valid\n\nif __name__ == \"__main__\":\n    success = main()\n    sys.exit(0 if success else 1)<\/code><\/pre>\n\n\n\n<h3 class=\"wp-block-heading\"><strong>\u96c6\u6210\u6d4b\u8bd5\u811a\u672c<\/strong>&nbsp;<code>run_integration_test.py<\/code><\/h3>\n\n\n\n<p>\u8fd9\u4e2a\u811a\u672c\u6a21\u62df\u5b8c\u6574\u7684\u5de5\u4f5c\u6d41\u5e76\u9a8c\u8bc1\u6bcf\u4e2a\u6b65\u9aa4<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># run_integration_test.py\nimport json\nimport os\nimport sys\nimport subprocess\nimport tempfile\nimport shutil\nfrom pathlib import Path\n\ndef run_command(command, description):\n    \"\"\"\u8fd0\u884c\u547d\u4ee4\u5e76\u68c0\u67e5\u7ed3\u679c\"\"\"\n    print(f\"\\n\ud83d\udd27 {description}\")\n    print(f\"   \u6267\u884c: {command}\")\n    \n    try:\n        result = subprocess.run(command, shell=True, capture_output=True, text=True)\n        \n        if result.returncode == 0:\n            print(\"\u2705 \u6267\u884c\u6210\u529f\")\n            return True, result.stdout\n        else:\n            print(f\"\u274c \u6267\u884c\u5931\u8d25 (\u9000\u51fa\u7801: {result.returncode})\")\n            print(f\"   \u9519\u8bef\u8f93\u51fa: {result.stderr}\")\n            return False, result.stderr\n    
except Exception as e:\n        print(f\"\u274c \u6267\u884c\u5f02\u5e38: {e}\")\n        return False, str(e)\n\ndef test_environment():\n    \"\"\"\u6d4b\u8bd5\u73af\u5883\u914d\u7f6e\"\"\"\n    print(\"=\" * 50)\n    print(\"1. \u6d4b\u8bd5\u73af\u5883\u914d\u7f6e\")\n    print(\"=\" * 50)\n    \n    return run_command(\"python check_environment.py\", \"\u68c0\u67e5Python\u73af\u5883\")\n\ndef test_api_connection(test_site):\n    \"\"\"\u6d4b\u8bd5API\u8fde\u63a5\"\"\"\n    print(\"\\n\" + \"=\" * 50)\n    print(\"2. \u6d4b\u8bd5API\u8fde\u63a5\")\n    print(\"=\" * 50)\n    \n    return run_command(f\"python test_wp_api.py {test_site}\", \"\u6d4b\u8bd5WordPress API\u8fde\u63a5\")\n\ndef create_test_extraction(test_site, temp_dir):\n    \"\"\"\u521b\u5efa\u6d4b\u8bd5\u63d0\u53d6\"\"\"\n    print(\"\\n\" + \"=\" * 50)\n    print(\"3. \u6d4b\u8bd5\u5185\u5bb9\u63d0\u53d6\")\n    print(\"=\" * 50)\n    \n    # \u521b\u5efa\u4e00\u4e2a\u5c0f\u7684\u6d4b\u8bd5\u63d0\u53d6\n    success, output = run_command(\n        f\"python extract_content.py {test_site}\", \n        \"\u63d0\u53d6WordPress\u5185\u5bb9\"\n    )\n    \n    if success:\n        # \u627e\u5230\u6700\u65b0\u7684\u63d0\u53d6\u76ee\u5f55\n        export_dir = find_latest_export_dir()\n        return success, export_dir\n    else:\n        return success, None\n\ndef find_latest_export_dir():\n    \"\"\"\u627e\u5230\u6700\u65b0\u7684\u5bfc\u51fa\u76ee\u5f55\"\"\"\n    base_dir = \".\/wordpress_exports\"\n    if not os.path.exists(base_dir):\n        return None\n    \n    dirs = &#91;d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]\n    if not dirs:\n        return None\n    \n    # \u6309\u65f6\u95f4\u6392\u5e8f\u627e\u5230\u6700\u65b0\u7684\n    latest_dir = sorted(dirs)&#91;-1]\n    return os.path.join(base_dir, latest_dir)\n\ndef validate_extraction(extract_dir):\n    \"\"\"\u9a8c\u8bc1\u63d0\u53d6\u7ed3\u679c\"\"\"\n    print(\"\\n\" + \"=\" * 50)\n    print(\"4. 
\u9a8c\u8bc1\u63d0\u53d6\u7ed3\u679c\")\n    print(\"=\" * 50)\n    \n    if not extract_dir or not os.path.exists(extract_dir):\n        print(\"\u274c \u6ca1\u6709\u627e\u5230\u63d0\u53d6\u76ee\u5f55\")\n        return False, None\n    \n    return run_command(\n        f\"python validate_extraction.py {extract_dir}\", \n        \"\u9a8c\u8bc1\u63d0\u53d6\u5185\u5bb9\u8d28\u91cf\"\n    )\n\ndef test_formatting(extract_dir):\n    \"\"\"\u6d4b\u8bd5\u683c\u5f0f\u5316\u8fc7\u7a0b\"\"\"\n    print(\"\\n\" + \"=\" * 50)\n    print(\"5. \u6d4b\u8bd5\u5185\u5bb9\u683c\u5f0f\u5316\")\n    print(\"=\" * 50)\n    \n    if not extract_dir:\n        return False, \"\u6ca1\u6709\u6709\u6548\u7684\u63d0\u53d6\u76ee\u5f55\"\n    \n    return run_command(\n        f\"python format_content.py {extract_dir}\", \n        \"\u683c\u5f0f\u5316\u63d0\u53d6\u7684\u5185\u5bb9\"\n    )\n\ndef generate_test_report(test_results, temp_dir):\n    \"\"\"\u751f\u6210\u6d4b\u8bd5\u62a5\u544a\"\"\"\n    report = {\n        \"test_timestamp\": str(Path(temp_dir).name),\n        \"overall_success\": all(result&#91;0] for result in test_results.values()),\n        \"detailed_results\": test_results,\n        \"recommendations\": &#91;]\n    }\n    \n    # \u6839\u636e\u6d4b\u8bd5\u7ed3\u679c\u751f\u6210\u5efa\u8bae\n    if not test_results&#91;\"environment\"]&#91;0]:\n        report&#91;\"recommendations\"].append(\"\u274c \u8bf7\u5148\u4fee\u590d\u73af\u5883\u914d\u7f6e\u95ee\u9898\")\n    if not test_results&#91;\"api\"]&#91;0]:\n        report&#91;\"recommendations\"].append(\"\u274c \u8bf7\u68c0\u67e5WordPress REST API\u914d\u7f6e\")\n    if not test_results&#91;\"extraction\"]&#91;0]:\n        report&#91;\"recommendations\"].append(\"\u274c \u5185\u5bb9\u63d0\u53d6\u5931\u8d25\uff0c\u8bf7\u68c0\u67e5\u7f51\u7ad9\u53ef\u8bbf\u95ee\u6027\")\n    if not test_results&#91;\"validation\"]&#91;0]:\n        report&#91;\"recommendations\"].append(\"\u26a0\ufe0f  
\u63d0\u53d6\u5185\u5bb9\u6709\u8d28\u91cf\u95ee\u9898\uff0c\u8bf7\u68c0\u67e5\u9a8c\u8bc1\u62a5\u544a\")\n    if not test_results&#91;\"formatting\"]&#91;0]:\n        report&#91;\"recommendations\"].append(\"\u26a0\ufe0f  \u683c\u5f0f\u5316\u8fc7\u7a0b\u6709\u95ee\u9898\uff0c\u8bf7\u68c0\u67e5\u63d0\u53d6\u5185\u5bb9\u683c\u5f0f\")\n    \n    if report&#91;\"overall_success\"]:\n        report&#91;\"recommendations\"].append(\"\ud83c\udf89 \u6240\u6709\u6d4b\u8bd5\u901a\u8fc7\uff01\u5de5\u4f5c\u6d41\u53ef\u4ee5\u6b63\u5e38\u8fd0\u884c\")\n    \n    # \u4fdd\u5b58\u62a5\u544a\n    report_path = os.path.join(temp_dir, \"integration_test_report.json\")\n    with open(report_path, 'w', encoding='utf-8') as f:\n        json.dump(report, f, ensure_ascii=False, indent=2)\n    \n    return report\n\ndef main():\n    if len(sys.argv) != 2:\n        print(\"\u4f7f\u7528\u65b9\u6cd5: python run_integration_test.py &lt;\u6d4b\u8bd5\u7f51\u7ad9\u5730\u5740&gt;\")\n        print(\"\u793a\u4f8b: python run_integration_test.py https:\/\/example.com\")\n        sys.exit(1)\n    \n    test_site = sys.argv&#91;1]\n    \n    # \u521b\u5efa\u4e34\u65f6\u76ee\u5f55\u7528\u4e8e\u6d4b\u8bd5\n    with tempfile.TemporaryDirectory() as temp_dir:\n        print(\"=\" * 50)\n        print(\"\ud83e\uddea WordPress\u5185\u5bb9\u63d0\u53d6\u96c6\u6210\u6d4b\u8bd5\")\n        print(\"=\" * 50)\n        print(f\"\u6d4b\u8bd5\u7f51\u7ad9: {test_site}\")\n        print(f\"\u5de5\u4f5c\u76ee\u5f55: {temp_dir}\")\n        \n        # \u4fdd\u5b58\u539f\u59cb\u5de5\u4f5c\u76ee\u5f55\n        original_dir = os.getcwd()\n        \n        try:\n            # \u5207\u6362\u5230\u4e34\u65f6\u76ee\u5f55\n            os.chdir(temp_dir)\n            \n            # \u590d\u5236\u811a\u672c\u5230\u4e34\u65f6\u76ee\u5f55\uff08\u5728\u5b9e\u9645\u4f7f\u7528\u4e2d\uff0c\u8fd9\u4e9b\u811a\u672c\u5e94\u8be5\u5728\u5f53\u524d\u76ee\u5f55\uff09\n            # 
\u8fd9\u91cc\u5047\u8bbe\u811a\u672c\u5df2\u7ecf\u5728\u5f53\u524d\u76ee\u5f55\u53ef\u7528\n            \n            # \u8fd0\u884c\u5404\u4e2a\u6d4b\u8bd5\u6b65\u9aa4\n            test_results = {}\n            \n            test_results&#91;\"environment\"] = test_environment()\n            test_results&#91;\"api\"] = test_api_connection(test_site)\n            test_results&#91;\"extraction\"], extract_dir = create_test_extraction(test_site, temp_dir)\n            test_results&#91;\"validation\"] = validate_extraction(extract_dir)\n            test_results&#91;\"formatting\"] = test_formatting(extract_dir)\n            \n            # \u751f\u6210\u6d4b\u8bd5\u62a5\u544a\n            report = generate_test_report(test_results, temp_dir)\n            \n            # \u663e\u793a\u6700\u7ec8\u7ed3\u679c\n            print(\"\\n\" + \"=\" * 50)\n            print(\"\ud83d\udccb \u96c6\u6210\u6d4b\u8bd5\u5b8c\u6210\")\n            print(\"=\" * 50)\n            \n            success_count = sum(1 for result in test_results.values() if result&#91;0])\n            total_count = len(test_results)\n            \n            print(f\"\u6d4b\u8bd5\u901a\u8fc7: {success_count}\/{total_count}\")\n            \n            if report&#91;\"overall_success\"]:\n                print(\"\ud83c\udf89 \u6240\u6709\u6d4b\u8bd5\u901a\u8fc7\uff01\u60a8\u7684\u5de5\u4f5c\u6d41\u8fd0\u884c\u6b63\u5e38\")\n            else:\n                print(\"\u274c \u90e8\u5206\u6d4b\u8bd5\u5931\u8d25\uff0c\u8bf7\u68c0\u67e5\u4ee5\u4e0b\u95ee\u9898:\")\n                for recommendation in report&#91;\"recommendations\"]:\n                    print(f\"   {recommendation}\")\n            \n            return report&#91;\"overall_success\"]\n            \n        finally:\n            # \u6062\u590d\u539f\u59cb\u5de5\u4f5c\u76ee\u5f55\n            os.chdir(original_dir)\n\nif __name__ == \"__main__\":\n    success = main()\n    sys.exit(0 if success else 1)<\/code><\/pre>\n\n\n\n<h2 
class=\"wp-block-heading\"><strong>\u65b0\u7684\u4e09\u6b65\u6cd5\u63d0\u53d6\u6d41\u7a0b<\/strong><\/h2>\n\n\n\n<p>\u5728 extract_content.py \u6587\u4ef6\u4e2d\uff0c\u6ce8\u610f\u5230\u811a\u672c\u4f1a\u4e00\u6b21\u6027\u63d0\u53d6\u51fa\u5168\u90e8\u7684\u6587\u4ef6\uff0c\u63d0\u53d6\u7684\u6587\u672c\u91cf\u5f88\u5927\uff0c\u80fd\u4e0d\u80fd\u6362\u4e2a\u601d\u8def\u91cd\u65b0\u8003\u8651\uff1f\u53ef\u4ee5\u5148\u5c06\u53d1\u73b0\u7684\u6587\u7ae0\u505a\u6210\u4e00\u4e2a\u5217\u8868\u8fd4\u56de\uff0c\u7136\u540e\u518d\u901a\u8fc7\u8865\u5145\u53c2\u6570\u6307\u5b9a\u8981\u63d0\u53d6\u7684\u6587\u7ae0\u3002\u5148\u63d0\u53d6\uff0c\u540e\u9762\u518d\u9a8c\u8bc1\u63d0\u53d6\u7ed3\u679c\u3002<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>1. \u53d1\u73b0\u9636\u6bb5 (discovery_phase.py)\n   \u2193\n2. \u9009\u62e9\u9636\u6bb5 (selection_phase.py) \n   \u2193\n3. \u63d0\u53d6\u9636\u6bb5 (extraction_phase.py)<\/code><\/pre>\n\n\n\n<h2 class=\"wp-block-heading\"><strong>\u811a\u672c1\uff1a\u6587\u7ae0\u53d1\u73b0\u811a\u672c<\/strong>\u00a0discovery_phase.py<\/h2>\n\n\n\n<p>\u8fd9\u4e2a\u811a\u672c\u53ea\u83b7\u53d6\u6587\u7ae0\u5217\u8868\u548c\u57fa\u672c\u4fe1\u606f\uff0c\u4e0d\u4e0b\u8f7d\u5b8c\u6574\u5185\u5bb9\u3002<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># discovery_phase.py\nimport requests\nimport json\nimport os\nimport sys\nfrom datetime import datetime\nfrom urllib.parse import urljoin\n\ndef discover_posts(site_url, max_posts=50):\n    \"\"\"\n    \u53d1\u73b0WordPress\u7f51\u7ad9\u4e0a\u7684\u6587\u7ae0\uff0c\u53ea\u83b7\u53d6\u5143\u6570\u636e\n    \u8fd4\u56de\u6587\u7ae0\u5217\u8868\uff0c\u4e0d\u4e0b\u8f7d\u5b8c\u6574\u5185\u5bb9\n    \"\"\"\n    print(f\"\ud83d\udd0d \u6b63\u5728\u53d1\u73b0\u6587\u7ae0...\")\n    print(f\"   \u7f51\u7ad9: {site_url}\")\n    print(f\"   \u6700\u5927\u6587\u7ae0\u6570: {max_posts}\")\n    \n    posts_list = &#91;]\n    page = 1\n    per_page = min(20, max_posts)\n    \n    while 
len(posts_list) &lt; max_posts:\n        # \u6784\u5efaAPI URL\n        api_url = f\"{site_url}\/wp-json\/wp\/v2\/posts\"\n        params = {\n            'page': page,\n            'per_page': per_page,\n            '_fields': 'id,title,date,modified,slug,link,excerpt'\n        }\n        \n        try:\n            response = requests.get(api_url, params=params, timeout=30)\n            \n            if response.status_code != 200:\n                print(f\"\u274c API\u8bf7\u6c42\u5931\u8d25: {response.status_code}\")\n                break\n            \n            posts = response.json()\n            \n            if not posts:\n                print(\"\u2705 \u5df2\u83b7\u53d6\u6240\u6709\u53ef\u7528\u6587\u7ae0\")\n                break\n            \n            for post in posts:\n                post_info = {\n                    'id': post.get('id'),\n                    'title': clean_html(post.get('title', {}).get('rendered', '\u65e0\u6807\u9898')),\n                    'date': post.get('date'),\n                    'modified': post.get('modified'),\n                    'slug': post.get('slug'),\n                    'link': post.get('link'),\n                    'excerpt': clean_html(post.get('excerpt', {}).get('rendered', ''))&#91;:200] + '...',\n                    'selected': False  # \u7528\u4e8e\u540e\u7eed\u9009\u62e9\n                }\n                posts_list.append(post_info)\n            \n            print(f\"   \u5df2\u53d1\u73b0 {len(posts_list)} \u7bc7\u6587\u7ae0...\")\n            \n            # \u5982\u679c\u8fd4\u56de\u7684\u6587\u7ae0\u6570\u5c11\u4e8e\u8bf7\u6c42\u6570\uff0c\u8bf4\u660e\u6ca1\u6709\u66f4\u591a\u6587\u7ae0\u4e86\n            if len(posts) &lt; per_page:\n                break\n                \n            page += 1\n            \n        except requests.exceptions.RequestException as e:\n            print(f\"\u274c \u8bf7\u6c42\u9519\u8bef: {e}\")\n            break\n        except json.JSONDecodeError as e:\n   
         print(f\"\u274c JSON\u89e3\u6790\u9519\u8bef: {e}\")\n            break\n    \n    return posts_list\n\ndef clean_html(text):\n    \"\"\"\u6e05\u7406HTML\u6807\u7b7e\"\"\"\n    import re\n    if not text:\n        return \"\"\n    clean = re.compile('&lt;.*?>')\n    return re.sub(clean, '', text)\n\ndef save_discovery_results(posts_list, output_dir, site_url):  # \u4fee\u590d\uff1a\u6dfb\u52a0 site_url \u53c2\u6570\n    \"\"\"\u4fdd\u5b58\u53d1\u73b0\u7ed3\u679c\"\"\"\n    timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n    discovery_file = os.path.join(output_dir, f\"discovery_results_{timestamp}.json\")\n    \n    # \u521b\u5efa\u8f93\u51fa\u76ee\u5f55\n    os.makedirs(output_dir, exist_ok=True)\n    \n    # \u4fdd\u5b58\u5b8c\u6574\u53d1\u73b0\u7ed3\u679c\n    discovery_data = {\n        'discovery_time': datetime.now().isoformat(),\n        'total_posts': len(posts_list),\n        'site_url': site_url,  # \u4f7f\u7528\u4f20\u5165\u7684 site_url \u53c2\u6570\n        'posts': posts_list\n    }\n    \n    with open(discovery_file, 'w', encoding='utf-8') as f:\n        json.dump(discovery_data, f, ensure_ascii=False, indent=2)\n    \n    # \u521b\u5efa\u7b80\u5316\u7684\u9009\u62e9\u6587\u4ef6\uff08\u4fbf\u4e8e\u7528\u6237\u7f16\u8f91\uff09\n    selection_template = {\n        'site_url': site_url,  # \u4f7f\u7528\u4f20\u5165\u7684 site_url \u53c2\u6570\n        'discovery_file': discovery_file,\n        'selected_posts': &#91;]  # \u7528\u6237\u5c06\u5728\u8fd9\u91cc\u6dfb\u52a0\u8981\u63d0\u53d6\u7684\u6587\u7ae0ID\n    }\n    \n    selection_file = os.path.join(output_dir, f\"selection_template_{timestamp}.json\")\n    with open(selection_file, 'w', encoding='utf-8') as f:\n        json.dump(selection_template, f, ensure_ascii=False, indent=2)\n    \n    return discovery_file, selection_file\n\ndef display_posts_summary(posts_list):\n    \"\"\"\u663e\u793a\u6587\u7ae0\u6458\u8981\"\"\"\n    print(f\"\\n\ud83d\udccb 
\u53d1\u73b0\u7ed3\u679c\u6458\u8981\")\n    print(\"=\" * 50)\n    print(f\"\u603b\u6587\u7ae0\u6570: {len(posts_list)}\")\n    print(\"\\n\u524d10\u7bc7\u6587\u7ae0:\")\n    print(\"-\" * 50)\n    \n    for i, post in enumerate(posts_list&#91;:10]):\n        print(f\"{i+1:2d}. ID: {post&#91;'id']} - {post&#91;'title']}\")\n        print(f\"     \u65e5\u671f: {post&#91;'date']} | \u94fe\u63a5: {post&#91;'link']}\")\n        print(f\"     \u6458\u8981: {post&#91;'excerpt']}\")\n        print()\n    \n    if len(posts_list) > 10:\n        print(f\"... \u8fd8\u6709 {len(posts_list) - 10} \u7bc7\u6587\u7ae0\")\n\ndef main():\n    if len(sys.argv) &lt; 2:\n        print(\"\u4f7f\u7528\u65b9\u6cd5: python discovery_phase.py &lt;\u7f51\u7ad9\u5730\u5740> &#91;\u6700\u5927\u6587\u7ae0\u6570]\")\n        print(\"\u793a\u4f8b: python discovery_phase.py https:\/\/example.com 100\")\n        sys.exit(1)\n    \n    site_url = sys.argv&#91;1]\n    max_posts = int(sys.argv&#91;2]) if len(sys.argv) > 2 else 50\n    \n    print(\"=\" * 50)\n    print(\"\ud83d\udd0d WordPress\u6587\u7ae0\u53d1\u73b0\u5de5\u5177\")\n    print(\"=\" * 50)\n    \n    # \u53d1\u73b0\u6587\u7ae0\n    posts_list = discover_posts(site_url, max_posts)\n    \n    if not posts_list:\n        print(\"\u274c \u6ca1\u6709\u53d1\u73b0\u4efb\u4f55\u6587\u7ae0\")\n        sys.exit(1)\n    \n    # \u663e\u793a\u6458\u8981\n    display_posts_summary(posts_list)\n    \n    # \u4fdd\u5b58\u7ed3\u679c - \u4fee\u590d\uff1a\u4f20\u9012 site_url \u53c2\u6570\n    output_dir = \".\/discovery_results\"\n    discovery_file, selection_file = save_discovery_results(posts_list, output_dir, site_url)\n    \n    print(f\"\\n\ud83d\udcbe \u53d1\u73b0\u7ed3\u679c\u5df2\u4fdd\u5b58:\")\n    print(f\"   \u5b8c\u6574\u53d1\u73b0\u6570\u636e: {discovery_file}\")\n    print(f\"   \u9009\u62e9\u6a21\u677f\u6587\u4ef6: {selection_file}\")\n    \n    print(f\"\\n\ud83d\udcdd \u4e0b\u4e00\u6b65:\")\n    print(f\"   1. 
\u7f16\u8f91 {selection_file}\")\n    print(f\"   2. \u5728 'selected_posts' \u6570\u7ec4\u4e2d\u6dfb\u52a0\u8981\u63d0\u53d6\u7684\u6587\u7ae0ID\")\n    print(f\"   3. \u8fd0\u884c: python selection_phase.py {selection_file}\")\n\nif __name__ == \"__main__\":\n    main()<\/code><\/pre>\n\n\n\n<h3 class=\"wp-block-heading\"><strong>\u811a\u672c2\uff1a\u6587\u7ae0\u9009\u62e9\u811a\u672c<\/strong>\u00a0selection_phase.py<\/h3>\n\n\n\n<p>\u8fd9\u4e2a\u811a\u672c\u8ba9\u7528\u6237\u57fa\u4e8e\u53d1\u73b0\u7ed3\u679c\u9009\u62e9\u8981\u63d0\u53d6\u7684\u6587\u7ae0<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># selection_phase.py\nimport json\nimport os\nimport sys\nfrom datetime import datetime\n\ndef load_discovery_results(discovery_file):\n    \"\"\"\u52a0\u8f7d\u53d1\u73b0\u7ed3\u679c\"\"\"\n    try:\n        with open(discovery_file, 'r', encoding='utf-8') as f:\n            return json.load(f)\n    except Exception as e:\n        print(f\"\u274c \u52a0\u8f7d\u53d1\u73b0\u7ed3\u679c\u5931\u8d25: {e}\")\n        return None\n\ndef interactive_selection(discovery_data):\n    \"\"\"\u4ea4\u4e92\u5f0f\u9009\u62e9\u6587\u7ae0\"\"\"\n    posts = discovery_data&#91;'posts']\n    \n    print(\"=\" * 50)\n    print(\"\ud83d\udcdd \u6587\u7ae0\u9009\u62e9\u754c\u9762\")\n    print(\"=\" * 50)\n    print(f\"\u603b\u6587\u7ae0\u6570: {len(posts)}\")\n    print(\"\\n\u8bf7\u9009\u62e9\u8981\u63d0\u53d6\u7684\u6587\u7ae0:\")\n    print(\"   &#91;A] \u9009\u62e9\u5168\u90e8\u6587\u7ae0\")\n    print(\"   &#91;R] \u6309\u8303\u56f4\u9009\u62e9\")\n    print(\"   &#91;I] \u6309ID\u9009\u62e9\")\n    print(\"   &#91;L] \u5217\u51fa\u6240\u6709\u6587\u7ae0\")\n    print(\"   &#91;Q] \u5b8c\u6210\u9009\u62e9\")\n    \n    selected_ids = &#91;]\n    \n    while True:\n        choice = input(\"\\n\u8bf7\u9009\u62e9\u64cd\u4f5c (A\/R\/I\/L\/Q): \").strip().upper()\n        \n        if choice == 'A':\n            selected_ids = &#91;post&#91;'id'] for post in posts]\n            
print(f\"\u2705 \u5df2\u9009\u62e9\u5168\u90e8 {len(selected_ids)} \u7bc7\u6587\u7ae0\")\n            break\n            \n        elif choice == 'R':\n            try:\n                start = int(input(\"\u8d77\u59cb\u5e8f\u53f7 (1-{}): \".format(len(posts))))\n                end = int(input(\"\u7ed3\u675f\u5e8f\u53f7 (1-{}): \".format(len(posts))))\n                if 1 &lt;= start &lt;= end &lt;= len(posts):\n                    selected_ids = &#91;posts&#91;i-1]&#91;'id'] for i in range(start, end+1)]\n                    print(f\"\u2705 \u5df2\u9009\u62e9\u6587\u7ae0 {start} \u5230 {end}\")\n                    break\n                else:\n                    print(\"\u274c \u8303\u56f4\u65e0\u6548\")\n            except ValueError:\n                print(\"\u274c \u8bf7\u8f93\u5165\u6709\u6548\u6570\u5b57\")\n                \n        elif choice == 'I':\n            try:\n                ids_input = input(\"\u8bf7\u8f93\u5165\u6587\u7ae0ID\uff08\u591a\u4e2aID\u7528\u9017\u53f7\u5206\u9694\uff09: \")\n                selected_ids = &#91;int(id_str.strip()) for id_str in ids_input.split(',')]\n                print(f\"\u2705 \u5df2\u9009\u62e9 {len(selected_ids)} \u7bc7\u6587\u7ae0\")\n                break\n            except ValueError:\n                print(\"\u274c \u8bf7\u8f93\u5165\u6709\u6548\u7684\u6570\u5b57ID\")\n                \n        elif choice == 'L':\n            print(\"\\n\ud83d\udccb \u6240\u6709\u6587\u7ae0\u5217\u8868:\")\n            print(\"-\" * 50)\n            for i, post in enumerate(posts, 1):\n                print(f\"{i:3d}. 
ID: {post&#91;'id']} - {post&#91;'title']}\")\n                \n        elif choice == 'Q':\n            if selected_ids:\n                break\n            else:\n                print(\"\u274c \u8bf7\u5148\u9009\u62e9\u6587\u7ae0\")\n        else:\n            print(\"\u274c \u65e0\u6548\u9009\u62e9\")\n    \n    return selected_ids\n\ndef update_selection_file(selection_file, selected_ids, discovery_data):\n    \"\"\"\u66f4\u65b0\u9009\u62e9\u6587\u4ef6\"\"\"\n    try:\n        with open(selection_file, 'r', encoding='utf-8') as f:\n            selection_data = json.load(f)\n        \n        # \u83b7\u53d6\u9009\u4e2d\u7684\u6587\u7ae0\u8be6\u60c5\n        selected_posts = &#91;]\n        all_posts = {post&#91;'id']: post for post in discovery_data&#91;'posts']}\n        \n        for post_id in selected_ids:\n            if post_id in all_posts:\n                selected_posts.append(all_posts&#91;post_id])\n        \n        selection_data&#91;'selected_posts'] = selected_posts\n        selection_data&#91;'selection_time'] = datetime.now().isoformat()\n        selection_data&#91;'total_selected'] = len(selected_posts)\n        \n        # \u4fdd\u5b58\u66f4\u65b0\u540e\u7684\u9009\u62e9\u6587\u4ef6\n        with open(selection_file, 'w', encoding='utf-8') as f:\n            json.dump(selection_data, f, ensure_ascii=False, indent=2)\n        \n        return selection_data\n        \n    except Exception as e:\n        print(f\"\u274c \u66f4\u65b0\u9009\u62e9\u6587\u4ef6\u5931\u8d25: {e}\")\n        return None\n\ndef main():\n    if len(sys.argv) != 2:\n        print(\"\u4f7f\u7528\u65b9\u6cd5: python selection_phase.py &lt;\u9009\u62e9\u6a21\u677f\u6587\u4ef6>\")\n        print(\"\u793a\u4f8b: python selection_phase.py .\/discovery_results\/selection_template_20240520_143022.json\")\n        sys.exit(1)\n    \n    selection_file = sys.argv&#91;1]\n    \n    if not os.path.exists(selection_file):\n        print(f\"\u274c 
\u9009\u62e9\u6587\u4ef6\u4e0d\u5b58\u5728: {selection_file}\")\n        sys.exit(1)\n    \n    print(\"=\" * 50)\n    print(\"\ud83d\udcdd WordPress\u6587\u7ae0\u9009\u62e9\u5de5\u5177\")\n    print(\"=\" * 50)\n    \n    # \u52a0\u8f7d\u9009\u62e9\u6587\u4ef6\n    try:\n        with open(selection_file, 'r', encoding='utf-8') as f:\n            selection_data = json.load(f)\n    except Exception as e:\n        print(f\"\u274c \u52a0\u8f7d\u9009\u62e9\u6587\u4ef6\u5931\u8d25: {e}\")\n        sys.exit(1)\n    \n    discovery_file = selection_data.get('discovery_file')\n    if not discovery_file or not os.path.exists(discovery_file):\n        print(f\"\u274c \u53d1\u73b0\u6587\u4ef6\u4e0d\u5b58\u5728: {discovery_file}\")\n        sys.exit(1)\n    \n    # \u52a0\u8f7d\u53d1\u73b0\u7ed3\u679c\n    discovery_data = load_discovery_results(discovery_file)\n    if not discovery_data:\n        sys.exit(1)\n    \n    # \u4ea4\u4e92\u5f0f\u9009\u62e9\n    selected_ids = interactive_selection(discovery_data)\n    \n    if not selected_ids:\n        print(\"\u274c \u6ca1\u6709\u9009\u62e9\u4efb\u4f55\u6587\u7ae0\")\n        sys.exit(1)\n    \n    # \u66f4\u65b0\u9009\u62e9\u6587\u4ef6\n    updated_selection = update_selection_file(selection_file, selected_ids, discovery_data)\n    \n    if updated_selection:\n        print(f\"\\n\u2705 \u9009\u62e9\u5b8c\u6210!\")\n        print(f\"   \u5df2\u9009\u62e9 {len(selected_ids)} \u7bc7\u6587\u7ae0\")\n        print(f\"   \u9009\u62e9\u6587\u4ef6\u5df2\u66f4\u65b0: {selection_file}\")\n        print(f\"\\n\ud83d\ude80 \u4e0b\u4e00\u6b65:\")\n        print(f\"   \u8fd0\u884c: python extraction_phase.py {selection_file}\")\n    else:\n        print(\"\u274c \u9009\u62e9\u5931\u8d25\")\n\nif __name__ == \"__main__\":\n    main()<\/code><\/pre>\n\n\n\n<h3 class=\"wp-block-heading\"><strong>\u811a\u672c3\uff1a\u5185\u5bb9\u63d0\u53d6\u811a\u672c<\/strong>\u00a0<code>extraction_phase.py<\/code><\/h3>\n\n\n\n<pre 
class=\"wp-block-code\"><code># extraction_phase.py\nimport requests\nimport json\nimport os\nimport sys\nfrom datetime import datetime\nfrom urllib.parse import urljoin\nimport time\n\ndef load_selection_data(selection_file):\n    \"\"\"\u52a0\u8f7d\u9009\u62e9\u6570\u636e\"\"\"\n    try:\n        with open(selection_file, 'r', encoding='utf-8') as f:\n            return json.load(f)\n    except Exception as e:\n        print(f\"\u274c \u52a0\u8f7d\u9009\u62e9\u6570\u636e\u5931\u8d25: {e}\")\n        return None\n\ndef extract_post_content(site_url, post_id):\n    \"\"\"\u63d0\u53d6\u5355\u7bc7\u6587\u7ae0\u7684\u5b8c\u6574\u5185\u5bb9\"\"\"\n    api_url = f\"{site_url}\/wp-json\/wp\/v2\/posts\/{post_id}\"\n    \n    try:\n        response = requests.get(api_url, timeout=30)\n        \n        if response.status_code == 200:\n            return response.json()\n        else:\n            print(f\"\u274c \u63d0\u53d6\u6587\u7ae0 {post_id} \u5931\u8d25: {response.status_code}\")\n            return None\n            \n    except requests.exceptions.RequestException as e:\n        print(f\"\u274c \u8bf7\u6c42\u6587\u7ae0 {post_id} \u5931\u8d25: {e}\")\n        return None\n\ndef save_extracted_content(post_data, output_dir):\n    \"\"\"\u4fdd\u5b58\u63d0\u53d6\u7684\u5185\u5bb9\"\"\"\n    post_id = post_data.get('id')\n    post_slug = post_data.get('slug', f'post_{post_id}')\n    \n    # \u521b\u5efa\u6587\u7ae0\u76ee\u5f55\n    post_dir = os.path.join(output_dir, f\"post_{post_id}_{post_slug}\")\n    os.makedirs(post_dir, exist_ok=True)\n    \n    # \u4fdd\u5b58\u539f\u59cbJSON\u6570\u636e\n    raw_file = os.path.join(post_dir, \"raw_data.json\")\n    with open(raw_file, 'w', encoding='utf-8') as f:\n        json.dump(post_data, f, ensure_ascii=False, indent=2)\n    \n    # \u63d0\u53d6\u5e76\u4fdd\u5b58\u7eaf\u6587\u672c\u5185\u5bb9\n    title = clean_html(post_data.get('title', {}).get('rendered', '\u65e0\u6807\u9898'))\n    content = 
clean_html(post_data.get('content', {}).get('rendered', ''))\n    excerpt = clean_html(post_data.get('excerpt', {}).get('rendered', ''))\n    \n    text_file = os.path.join(post_dir, \"content.txt\")\n    with open(text_file, 'w', encoding='utf-8') as f:\n        f.write(f\"\u6807\u9898: {title}\\n\")\n        f.write(f\"\u65e5\u671f: {post_data.get('date')}\\n\")\n        f.write(f\"\u4fee\u6539\u65f6\u95f4: {post_data.get('modified')}\\n\")\n        f.write(f\"\u94fe\u63a5: {post_data.get('link')}\\n\")\n        f.write(f\"\u6458\u8981: {excerpt}\\n\")\n        f.write(\"\\n\" + \"=\"*50 + \"\\n\")\n        f.write(\"\u6b63\u6587\u5185\u5bb9:\\n\")\n        f.write(\"=\"*50 + \"\\n\\n\")\n        f.write(content)\n    \n    # \u521b\u5efa\u6587\u7ae0\u5143\u6570\u636e\n    meta = {\n        'extraction_time': datetime.now().isoformat(),\n        'post_id': post_id,\n        'post_slug': post_slug,\n        'title': title,\n        'date': post_data.get('date'),\n        'link': post_data.get('link'),\n        'content_length': len(content),\n        'word_count': len(content.split())\n    }\n    \n    meta_file = os.path.join(post_dir, \"metadata.json\")\n    with open(meta_file, 'w', encoding='utf-8') as f:\n        json.dump(meta, f, ensure_ascii=False, indent=2)\n    \n    return {\n        'post_id': post_id,\n        'title': title,\n        'content_length': len(content),\n        'word_count': len(content.split()),\n        'output_dir': post_dir\n    }\n\ndef clean_html(text):\n    \"\"\"\u6e05\u7406HTML\u6807\u7b7e\"\"\"\n    import re\n    if not text:\n        return \"\"\n    clean = re.compile('&lt;.*?>')\n    text = re.sub(clean, '', text)\n    # \u6e05\u7406\u591a\u4f59\u7684\u7a7a\u767d\u5b57\u7b26\n    text = re.sub(r'\\n\\s*\\n', '\\n\\n', text)\n    text = re.sub(r'&#91; \\t]+', ' ', text)\n    return text.strip()\n\ndef create_extraction_summary(extraction_results, selection_data, output_dir):\n    
\"\"\"\u521b\u5efa\u63d0\u53d6\u6458\u8981\"\"\"\n    summary = {\n        'extraction_time': datetime.now().isoformat(),\n        'site_url': selection_data.get('site_url'),\n        'total_selected': selection_data.get('total_selected', 0),\n        'successfully_extracted': len(extraction_results),\n        'failed_extractions': selection_data.get('total_selected', 0) - len(extraction_results),\n        'extraction_results': extraction_results,\n        'output_directory': output_dir\n    }\n    \n    summary_file = os.path.join(output_dir, \"extraction_summary.json\")\n    with open(summary_file, 'w', encoding='utf-8') as f:\n        json.dump(summary, f, ensure_ascii=False, indent=2)\n    \n    return summary_file\n\ndef main():\n    if len(sys.argv) != 2:\n        print(\"\u4f7f\u7528\u65b9\u6cd5: python extraction_phase.py &lt;\u9009\u62e9\u6587\u4ef6>\")\n        print(\"\u793a\u4f8b: python extraction_phase.py .\/discovery_results\/selection_template_20240520_143022.json\")\n        sys.exit(1)\n    \n    selection_file = sys.argv&#91;1]\n    \n    print(\"=\" * 50)\n    print(\"\ud83d\udce5 WordPress\u5185\u5bb9\u63d0\u53d6\u5de5\u5177\")\n    print(\"=\" * 50)\n    \n    # \u52a0\u8f7d\u9009\u62e9\u6570\u636e\n    selection_data = load_selection_data(selection_file)\n    if not selection_data:\n        sys.exit(1)\n    \n    site_url = selection_data.get('site_url')\n    selected_posts = selection_data.get('selected_posts', &#91;])\n    \n    if not selected_posts:\n        print(\"\u274c \u6ca1\u6709\u9009\u62e9\u4efb\u4f55\u6587\u7ae0\")\n        sys.exit(1)\n    \n    print(f\"\ud83c\udf10 \u7f51\u7ad9: {site_url}\")\n    print(f\"\ud83d\udcdd \u51c6\u5907\u63d0\u53d6 {len(selected_posts)} \u7bc7\u6587\u7ae0\")\n    \n    # \u521b\u5efa\u8f93\u51fa\u76ee\u5f55\n    timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n    output_dir = f\".\/extracted_content_{timestamp}\"\n    os.makedirs(output_dir, exist_ok=True)\n    \n    extraction_results = 
&#91;]\n    failed_extractions = &#91;]\n    \n    # \u9010\u7bc7\u63d0\u53d6\u6587\u7ae0\n    for i, post_info in enumerate(selected_posts, 1):\n        post_id = post_info&#91;'id']\n        post_title = post_info&#91;'title']\n        \n        print(f\"\\n&#91;{i}\/{len(selected_posts)}] \u63d0\u53d6\u6587\u7ae0: {post_title}\")\n        \n        # \u63d0\u53d6\u5185\u5bb9\n        post_data = extract_post_content(site_url, post_id)\n        \n        if post_data:\n            # \u4fdd\u5b58\u5185\u5bb9\n            result = save_extracted_content(post_data, output_dir)\n            extraction_results.append(result)\n            print(f\"   \u2705 \u63d0\u53d6\u6210\u529f: {result&#91;'word_count']} \u5b57\")\n        else:\n            failed_extractions.append(post_id)\n            print(f\"   \u274c \u63d0\u53d6\u5931\u8d25\")\n        \n        # \u6dfb\u52a0\u77ed\u6682\u5ef6\u8fdf\uff0c\u907f\u514d\u5bf9\u670d\u52a1\u5668\u9020\u6210\u538b\u529b\n        time.sleep(0.5)\n    \n    # \u521b\u5efa\u63d0\u53d6\u6458\u8981\n    summary_file = create_extraction_summary(extraction_results, selection_data, output_dir)\n    \n    # \u663e\u793a\u7ed3\u679c\u6458\u8981\n    print(f\"\\n\" + \"=\" * 50)\n    print(\"\ud83d\udcca \u63d0\u53d6\u7ed3\u679c\u6458\u8981\")\n    print(\"=\" * 50)\n    print(f\"\u2705 \u6210\u529f\u63d0\u53d6: {len(extraction_results)} \u7bc7\")\n    print(f\"\u274c \u63d0\u53d6\u5931\u8d25: {len(failed_extractions)} \u7bc7\")\n    print(f\"\ud83d\udcc1 \u8f93\u51fa\u76ee\u5f55: {output_dir}\")\n    print(f\"\ud83d\udcc4 \u6458\u8981\u6587\u4ef6: {summary_file}\")\n    \n    if failed_extractions:\n        print(f\"\\n\u5931\u8d25\u7684\u6587\u7ae0ID: {failed_extractions}\")\n    \n    print(f\"\\n\ud83c\udf89 \u63d0\u53d6\u5b8c\u6210!\")\n\nif __name__ == \"__main__\":\n    main()<\/code><\/pre>\n\n\n\n<h3 
class=\"wp-block-heading\">\u00a0\u65b0\u7684\u4f7f\u7528\u6d41\u7a0b<\/h3>\n\n\n\n<p><strong>\u7b2c\u4e00\u6b65\uff1a\u53d1\u73b0\u6587\u7ae0<\/strong><\/p>\n\n\n\n<p>python discovery_phase.py https:\/\/\u4f60\u7684\u7f51\u7ad9\u5730\u5740 100<\/p>\n\n\n\n<p><strong>\u7b2c\u4e8c\u6b65\uff1a\u9009\u62e9\u6587\u7ae0<\/strong><\/p>\n\n\n\n<p>python selection_phase.py .\/discovery_results\/selection_template_\u65f6\u95f4\u6233.json<\/p>\n\n\n\n<p><strong>\u7b2c\u4e09\u6b65\uff1a\u63d0\u53d6\u5185\u5bb9<\/strong><\/p>\n\n\n\n<p>python extraction_phase.py .\/discovery_results\/selection_template_\u65f6\u95f4\u6233.json<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u811a\u672c\u540d\u79f0 \u4e3b\u8981\u529f\u80fd \u5f53\u524d\u72b6\u6001 \u5907\u6ce8 check_environment.py \u68c0\u67e5Python\u73af\u5883\u548c\u4f9d\u8d56\u5305 \u5b8c [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[28,55,29],"tags":[97,96],"class_list":["post-984","post","type-post","status-publish","format-standard","hentry","category-tech-writing","category-technology","category-efficient-workflow","tag-python","tag-wordpress"],"_links":{"self":[{"href":"http:\/\/www.preluna.xyz\/index.php\/wp-json\/wp\/v2\/posts\/984","targetHints":{"allow":["GET"]}}],"collection":[{"href":"http:\/\/www.preluna.xyz\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/www.preluna.xyz\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/www.preluna.xyz\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/www.preluna.xyz\/index.php\/wp-json\/wp\/v2\/comments?post=984"}],"version-history":[{"count":5,"href":"http:\/\/www.preluna.xyz\/index.php\/wp-json\/wp\/v2\/posts\/984\/revisions"}],"predecessor-version":[{"id":992,"href":"http:\/\/www.preluna.xyz\/index.php\/wp-json\/wp\/v2\/posts\/
984\/revisions\/992"}],"wp:attachment":[{"href":"http:\/\/www.preluna.xyz\/index.php\/wp-json\/wp\/v2\/media?parent=984"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/www.preluna.xyz\/index.php\/wp-json\/wp\/v2\/categories?post=984"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/www.preluna.xyz\/index.php\/wp-json\/wp\/v2\/tags?post=984"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}