add crawling files.
scripts/get_u9a9.py (new file, 117 lines)
@@ -0,0 +1,117 @@
"""
Script Name: get_u9a9.py

Description: Fetch data from u9a9. Generated from the following prompt:

    We need to visit https://u9a9.org/?type=2&search={q}&p=4 and return the data.
    Detailed requirements:

    For the q parameter we have an array: qlist = ['[BD', '合集2'].

    The p parameter is the page number to visit; it normally starts at 1.

    We loop over qlist and, for each value, starting from p=1, build a URL and fetch
    its content. The result is a page whose simplified structure is the one I sent
    you earlier. We need to:

    Parse the tr rows inside the tbody tag; for each tr, take the title text of the
    second td, strip the [BD/{}] part, and record it as title.

    Take the first link in the third td, which is a .torrent file; download it and
    save it as {title}.torrent.

    Then parse the content of <div class="center">, which is a pagination bar. We only
    care about the li whose text is >>; extract its href attribute and pull out the p
    value. Appended to the URL above, this gives the next page to visit. If no such
    row is matched, the crawl is finished.

    Please read the requirements above and write the corresponding Python script.

Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0

Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import os
import re
import time

import requests
from bs4 import BeautifulSoup

# Spoof a browser User-Agent so the requests are less likely to be blocked as a bot
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0'
}

# Search terms to crawl
qlist = ['[BD']

# Directory where the downloaded .torrent files are stored
download_path = "./torrents/"
if not os.path.exists(download_path):
    os.makedirs(download_path)


def download_torrent(torrent_url, title):
    try:
        # Fetch the .torrent file
        response = requests.get(torrent_url, headers=headers, stream=True)
        torrent_file_name = f"{title}.torrent"
        torrent_path = os.path.join(download_path, torrent_file_name)

        # Save it to disk
        with open(torrent_path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {torrent_file_name}")
    except Exception as e:
        print(f"Error downloading {torrent_url}: {str(e)}")


# Parse one result page: download every torrent on it and return the next page number, if any
def parse_page(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    # All result rows inside the tbody
    tbody = soup.find('tbody')
    rows = tbody.find_all('tr', class_='default')

    for row in rows:
        # Title text from the second td, with the leading [BD/...] tag stripped
        title_td = row.find_all('td')[1]
        raw_title = title_td.find('a')['title'].strip()
        # title = re.sub(r'\[BD/\d+\.\d+G\]', '', raw_title).strip()
        title = re.sub(r'\[.*?\]', '', raw_title).strip()

        # First .torrent link in the third td
        magnet_td = row.find_all('td')[2]
        torrent_link = magnet_td.find('a', href=re.compile(r'\.torrent'))['href']
        # Build the absolute URL; strip the '-' characters to clean up the host part
        full_torrent_link = f"https:{torrent_link}".replace('-', '')

        # Download the torrent file
        download_torrent(full_torrent_link, title)
        time.sleep(3)  # throttle requests

    # Parse the pagination bar to find the link to the next page
    pagination = soup.find('div', class_='center').find('nav').find('ul', class_='pagination')
    next_page = pagination.find('a', text='»')

    if next_page:
        next_page_url = next_page['href']
        next_p_value = re.search(r'p=(\d+)', next_page_url).group(1)
        return next_p_value
    return None


# Crawl every page for query q, starting at page start_p
def scrape(q, start_p=1):
    p = start_p
    while True:
        url = f"https://u9a9.org/?type=2&search={q}&p={p}"
        print(f"Fetching URL: {url}")
        response = requests.get(url, headers=headers)

        if response.status_code != 200:
            print(f"Failed to fetch {url}")
            break

        next_p = parse_page(response.text)

        if next_p:
            p = next_p
        else:
            print(f"No more pages for query {q}.")
            break


# Iterate over every search term
if __name__ == "__main__":
    for q in qlist:
        scrape(q, start_p=1)
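
# Usage sketch (assumed invocation; the script takes no CLI arguments):
#   python scripts/get_u9a9.py
# Downloaded .torrent files are written to ./torrents/.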