diff --git a/scrapy_proj/cron/cmd.txt b/scrapy_proj/cron/cmd.txt
index 787b905..8a305e1 100644
--- a/scrapy_proj/cron/cmd.txt
+++ b/scrapy_proj/cron/cmd.txt
@@ -1,8 +1,11 @@
 scrapy crawl clm -a mod='update' -a begin='2025-07-10' -s STATS_EXPORT_INTERVAL=180 -a query_str="groups='actress' and tags like '%vixen%' "
 scrapy crawl clm -a mod='update' -a begin='2025-07-10' -s STATS_PUSH_MSG=False
 scrapy crawl clm -a mod='reload' -s STATS_PUSH_MSG=False -a file_path=./scrapy_proj/data/clm_keywords.json
+scrapy crawl clm -a begin='2025-07-15' -a mod='update' -a query_str="groups not like '%actress%'" -s STATS_PUSH_MSG=False
+scrapy crawl clm -a mod='all' -a query_str="groups not like '%actress%'" -s STATS_PUSH_MSG=False
 
-scrapy crawl u3c3 -a begin='2025-07-04' end='2024-07-12'
+scrapy crawl u3c3 -a begin='2025-07-04' -a end='2024-07-12'
+scrapy crawl u3c3 -a begin='2023-08-10' -a end='2024-10-15' -s STATS_PUSH_MSG=False
 
 scrapy crawl pbox -a mod='update' -a begin='2025-07-16'
 scrapy crawl pbox -a debug=1 -a cmd='studio,movies'
diff --git a/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py b/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py
index 0570ec6..7d28d27 100644
--- a/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py
+++ b/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py
@@ -415,6 +415,10 @@ class SQLiteDBHandler(metaclass=SingletonMeta):  # 应用单例元类
             else:
                 logging.warning(f"不支持的条件类型: {condition_type},键: {key}")
 
+        # Handle a raw query_str filter passed straight through from the spider
+        if "query_str" in filters:
+            sql += f" AND ({filters['query_str']})"
+
         # 处理排序(基于校验后的valid_order_fields)
         if "order_by" in valid_filters:
             sql += f" ORDER BY {valid_filters['order_by']}"
diff --git a/scrapy_proj/scrapy_proj/spiders/sis_spider.py b/scrapy_proj/scrapy_proj/spiders/sis_spider.py
index 0a7d15e..a361eb0 100644
--- a/scrapy_proj/scrapy_proj/spiders/sis_spider.py
+++ b/scrapy_proj/scrapy_proj/spiders/sis_spider.py
@@ -10,6 +10,35 @@ class Sis001Spider(BaseSpider):
     name = SPIDER_NAME_SIS
     allowed_domains = ["sis001.com"]
 
+    # Request headers copied from the captured curl command
+    custom_settings = {
+        "DEFAULT_REQUEST_HEADERS": {
+            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
+            # Cookie copied from the curl command (note: it expires and must be refreshed from the browser periodically)
+            "cookie": "cdb2_sid=ZYBNN5; cdb2_uvStat=1757123386; cdb2_fuvs=25; cf_clearance=yTEYSenVbpSFycZ.Ru.XSWP8ioz0bAJiuI0LlKeJ73Q-1757123388-1.2.1.1-J2vSisVKrWSEjSMJHhXKIrgyWPzbj7s2iHLZuYkpdP8fHyqaMeq1.Q0Y4X7Lc4eSWuMz2PTn8I2pfnayrs3SdLWQ8lMTWF0.uYnOewGzf7sX9t2xkTYtD.Y99JUmkKOetkB9gi6afriLNVwgYkuE.P3nH9x8HNGW.tal8hXg.tZk8ZwwLI_4LkbX8ah4Ir7LuyjKiodZzrn8FT3aUaAy9R1iSOCIibjd.cinPjSZWrcNxDYOXH4MV5m8OM4J_iy5",
+            "priority": "u=0, i",
+            "referer": "https://sis001.com/",  # Referer matches the first request; adjust dynamically for later requests if needed
+            "sec-ch-ua": "\"Not;A=Brand\";v=\"99\", \"Microsoft Edge\";v=\"139\", \"Chromium\";v=\"139\"",
+            "sec-ch-ua-arch": "\"arm\"",
+            "sec-ch-ua-bitness": "\"64\"",
+            "sec-ch-ua-full-version": "\"139.0.3405.119\"",
+            "sec-ch-ua-full-version-list": "\"Not;A=Brand\";v=\"99.0.0.0\", \"Microsoft Edge\";v=\"139.0.3405.119\", \"Chromium\";v=\"139.0.7258.139\"",
+            "sec-ch-ua-mobile": "?0",
+            "sec-ch-ua-model": "\"\"",
+            "sec-ch-ua-platform": "\"macOS\"",
+            "sec-ch-ua-platform-version": "\"14.6.1\"",
+            "sec-fetch-dest": "document",
+            "sec-fetch-mode": "navigate",
+            "sec-fetch-site": "same-origin",
+            "sec-fetch-user": "?1",
+            "upgrade-insecure-requests": "1",
+            # User-Agent matches the real browser exactly to avoid being flagged as a crawler
+            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36 Edg/139.0.0.0"
+        },
+        "COOKIES_ENABLED": True  # enable cookie handling
+    }
+
     def __init__(self, debug='False', begin=None, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
@@ -45,14 +74,29 @@ class Sis001Spider(BaseSpider):
                 'ident' : 'forum_77'
             },
         ]
+        sections2 = [
+            {
+                'plate' : 'sis_asia_yc_2',
+                'plate_name' : '亚有原创',
+                'url' : 'https://sis001.com/forum/forum-230-1.html',
+                'ident' : 'forum_230'
+            },
+            {
+                'plate' : 'sis_asia_zt_2',
+                'plate_name' : '亚有转帖',
+                'url' : 'https://sis001.com/forum/forum-58-1.html',
+                'ident' : 'forum_58'
+            },
+        ]
 
         for item in sections:
-            yield scrapy.Request(item['url'], callback=self.parse_page_common, meta=item)
+            yield scrapy.Request(item['url'], headers=self.custom_settings.get("DEFAULT_REQUEST_HEADERS"), callback=self.parse_page_common, meta=item)
 
     def parse_page_common(self, response):
         ident = response.meta['ident']
         plate_name = response.meta['plate_name']
+        self.logger.debug(f"url: {response.url}, response: {response.text[:1000]}")
 
         # 查找目标表格
         tables = response.css(f'table#{ident}')
         if not tables:
diff --git a/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py b/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py
index a839887..8da4789 100644
--- a/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py
+++ b/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py
@@ -7,8 +7,9 @@ from scrapy_proj.comm.comm_def import SPIDER_NAME_U3C3
 
 class U001Spider(BaseSpider):
     name = SPIDER_NAME_U3C3
-    allowed_domains = ["u001.25img.com"]
-    start_urls = ["https://u001.25img.com/?p=1"]
+    allowed_domains = ["u001.25img.com", 'u9a9.com']
+    start_urls = ["https://u001.25img.com/?p=1", 'https://u9a9.com/?type=2&p=1']
+    #start_urls = ['https://u9a9.com/?type=2&p=1']
 
     def __init__(self, debug='False', begin=None, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -46,11 +47,32 @@
                 yield item
 
         if need_next :
-            # 翻页逻辑
-            current_page = int(response.url.split('=')[-1])
-            total_pages = int(response.css('script:contains("totalPages")').re_first(r'totalPages:\s*(\d+)'))
-            if current_page < total_pages:
-                if self.debug and current_page >= 5:
-                    self.logger.info(f"debug mod. stop crawling.")
-                else:
-                    yield response.follow(f"?p={current_page + 1}", self._parse)
\ No newline at end of file
+            if "u9a9" in response.url:
+                pagination_links = response.css('ul.pagination li a')
+                next_page_url = None
+                # 2. Walk the pagination links and find the one whose text is » (ignoring surrounding whitespace)
+                for link in pagination_links:
+                    # Extract the <a> tag's text (get() returns a single value, or None by default)
+                    link_text = link.css('::text').get() or ''
+                    # Strip surrounding whitespace and check whether the text equals »
+                    if link_text.strip() == '»':
+                        # Extract the href attribute (a relative path)
+                        next_page_rel = link.css('::attr(href)').get()
+                        if next_page_rel:
+                            # Build the absolute URL
+                            next_page_url = response.urljoin(next_page_rel)
+                            break  # stop once the next-page link is found
+                if next_page_url:
+                    if self.debug and 'p=5' in next_page_url:
+                        self.logger.info(f"debug mode. stop crawling.")
+                    else:
+                        yield response.follow(next_page_url, self._parse)
+            else:
+                # Pagination logic for the u001 site
+                current_page = int(response.url.split('=')[-1])
+                total_pages = int(response.css('script:contains("totalPages")').re_first(r'totalPages:\s*(\d+)'))
+                if current_page < total_pages:
+                    if self.debug and current_page >= 5:
+                        self.logger.info(f"debug mode. stop crawling.")
+                    else:
+                        yield response.follow(f"?p={current_page + 1}", self._parse)
\ No newline at end of file
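
For reference, a minimal sketch (not part of the patch) of what the new query_str branch in sqlite_base.py produces when fed one of the query_str values used in cron/cmd.txt. The table name and base SELECT below are hypothetical stand-ins, not taken from the repo; note that the expression is spliced into the SQL verbatim, so it should only come from trusted spider arguments.

    # Hypothetical illustration of the query_str filter added in sqlite_base.py;
    # "movies" and the base SELECT are assumptions for the sketch only.
    filters = {"query_str": "groups not like '%actress%'"}  # value taken from cron/cmd.txt

    sql = "SELECT * FROM movies WHERE 1=1"
    if "query_str" in filters:
        # Same composition as the patched code: the raw expression is ANDed onto the WHERE clause.
        sql += f" AND ({filters['query_str']})"

    print(sql)
    # -> SELECT * FROM movies WHERE 1=1 AND (groups not like '%actress%')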