这个格式，那么读取它的下一行，去掉空格与tab符号之后，会剩下一个日期字符串，把这个字符串记为 pubdate；我们会得到 str1__pubdate__str2 这样的文本，把它保存到一个变量 res 中；继续遍历页面，如果找到匹配下一頁格式的一行，说明还有下一页，把其中的 page=5 的数字提取出来，修改上面的 URL，填入新的 p值，继续访问；如果无法匹配，那就代表着结束，我们把 res 输出到一个文件中，它命名为 {q}_all.txt 请你理解上述需求，并写出对应的python代码。 Author: [Your Name] Created Date: YYYY-MM-DD Last Modified: YYYY-MM-DD Version: 1.0 Modification History: - YYYY-MM-DD [Your Name]: - YYYY-MM-DD [Your Name]: - YYYY-MM-DD [Your Name]: """ import requests #from bs4 import BeautifulSoup import re import time # 参数定义 qlist = ['MKBD', 'LAFBD', 'S2MBD', 'SKYHD', 'SMBD', 'CWPBD', 'DRGBD', 'DSAMBD'] base_url = "https://javdb.com/search?f=all&page={}&q={}" # 临时跑数据 qlist = ['SMBD', 'CWPBD', 'DRGBD', 'DSAMBD'] # 正则表达式匹配模式 title_pattern = r'

(.*?)\s*(.*?)

' meta_pattern = r'

' next_page_pattern = r' if re.search(meta_pattern, line): meta_found = True continue # 如果上一行是

，则处理下一行的日期 if meta_found: pubdate = line.strip() meta_found = False # 如果标题和日期都匹配到了，存储结果 if str1 and str2 and pubdate: res.append(f"{str1}__{pubdate}__{str2}") str1, str2, pubdate = None, None, None # 尝试匹配下一页链接 next_page_match = re.search(next_page_pattern, line) if next_page_match: next_page = next_page_match.group(1) return res, next_page def scrape_videos_for_q(q): """对指定的q参数进行抓取""" p = 1 res = [] while True: # 构建 URL url = base_url.format(p, q) print(f"正在访问：{url}") page_content = get_page_content(url) if page_content: # 解析页面内容 results, next_page = parse_page(page_content) res.extend(results) # 如果有下一页，继续，否则结束 if next_page: p = int(next_page) time.sleep(5) # 避免请求过快 else: break else: print(f"未能获取页面内容，跳过 q={q} 的处理") break # 将结果保存到文件中 if res: output_filename = f"./javdb/{q}_all.txt" with open(output_filename, 'w', encoding='utf-8') as f: f.write("\n".join(res)) print(f"已保存结果到 {output_filename}") def main(): """主函数""" for q in qlist: scrape_videos_for_q(q) if __name__ == "__main__": main()