统一格式(无实质改动) & 添加了图片 (#23)

* 修复了style和script标签无法去除的问题
* 添加了图片展示 & 统一了引号格式
* 添加了图片展示 & 统一了引号格式
* 统一引号格式并异步处理函数
2026-07-31 16:44:20 +00:00 · 2024-12-12 13:47:28 +08:00
parent 6a4b0bbd0d
commit b939a48b0b
6 changed files with 74 additions and 65 deletions
@@ -6,7 +6,7 @@ import urllib.parse
 from bs4 import BeautifulSoup

 headers = {
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
 }

 async def get_async_data (url):
@@ -19,7 +19,7 @@ async def search(msg : str, num : int):

    url = "https://mzh.moegirl.org.cn/index.php?search=" + urllib.parse.quote_plus(msg)
    response = await get_async_data(url)
-    logger.success(f"连接{url}完成, 状态码 : {response.status_code}")
+    logger.success(f"连接\"{url}\"完成, 状态码 : {response.status_code}")

    # 正常搜索
    if response.status_code == 200:
@@ -35,25 +35,25 @@ async def search(msg : str, num : int):
                └── li ...
                └── li ...
        """
-        soup = BeautifulSoup(response.text, 'html.parser')
+        soup = BeautifulSoup(response.text, "html.parser")

        # 检测ul.mw-search-results, 是否有结果
-        if soup.find('ul', class_='mw-search-results'):
-            ul_tag = soup.select('ul.mw-search-results')[0]
-            li_tags = ul_tag.select('li')
+        ul_tag = soup.find("ul", class_ = "mw-search-results")
+        if ul_tag:
+            li_tags = ul_tag.find_all("li")
            for li_tag in li_tags:

-                div_heading = li_tag.select('div.mw-search-result-heading')[0]
+                div_heading = li_tag.find("div", class_ = "mw-search-result-heading")
                if div_heading:
-                    a_tag = div_heading.select('a')[0]
-                    result += a_tag['title'] + "\n"
-                    logger.info(f"搜索到 : \"{a_tag['title']}\"")
+                    a_tag = div_heading.find("a")
+                    result += a_tag["title"] + "\n"
+                    logger.info(f"搜索到 : \"{a_tag["title"]}\"")

-                div_result = li_tag.find('div', class_='searchresult')
+                div_result = li_tag.find("div", class_="searchresult")
                if div_result:
-                    content = str(div_result).replace('<div class=\"searchresult\">', '').replace('</div>', '')
-                    content = content.replace('<span class=\"searchmatch\">', '').replace('</span>', '')
-                    result += content + "\n\n"
+                    content = str(div_result).replace("<div class=\"searchresult\">", "").replace("</div>", "")
+                    content = content.replace("<span class=\"searchmatch\">", "").replace("</span>", "")
+                    result += content + "\n"

                num -= 1
                if num == 0:
@@ -67,10 +67,10 @@ async def search(msg : str, num : int):

    # 重定向
    elif response.status_code == 302:
-        logger.info(f"\"{msg}\"已被重定向至\"{response.headers.get('location')}\"")
+        logger.info(f"\"{msg}\"已被重定向至\"{response.headers.get("location")}\"")
        # 读取重定向结果
-        response = await get_async_data(response.headers.get('location'))
-        soup = BeautifulSoup(response.text, 'html.parser')
+        response = await get_async_data(response.headers.get("location"))
+        soup = BeautifulSoup(response.text, "html.parser")
        logger.success("重定向成功")
        num = 0

@@ -83,19 +83,28 @@ async def search(msg : str, num : int):
                └── p                   # 人物介绍
                └── ...
        """
-        if soup.find('div', class_='mw-parser-output'):
-            div = soup.find('div', class_='mw-parser-output')
-            p_tags = div.select('p')
+
+        result += msg + "\n"
+        img = soup.find("img", class_="infobox-image")
+        if img:
+            logger.info(f"照片{img["src"]}")
+            result += f"![ {msg} ]( {img["src"]} ) \n"
+
+        div = soup.find("div", class_="mw-parser-output")
+        if div:
+            p_tags = div.find_all("p")
            for p_tag in p_tags:
                p = str(p_tag)
-                p = re.sub(r'<.*?>', '', p)
-                if p != '':
+                p = re.sub(r"<script.*?</script>|<style.*?</style>", "", p, flags=re.DOTALL)
+                p = re.sub(r"<.*?>", "", p, flags = re.DOTALL)
+                p = re.sub(r"\[.*?]", "", p, flags = re.DOTALL)
+                if p != "":
                    result += str(p)

                    num += 1
                    if num >= 5:
                        break
-            return result
+        return result

    # 状态码非200或302
    else: