modify scripts

2025-10-15 16:21:20 +08:00
parent 265504632c
commit e81ca8a3a4
4 changed files with 85 additions and 12 deletions

@@ -1,8 +1,11 @@
scrapy crawl clm -a mod='update' -a begin='2025-07-10' -s STATS_EXPORT_INTERVAL=180 -a query_str="groups='actress' and tags like '%vixen%' "
scrapy crawl clm -a mod='update' -a begin='2025-07-10' -s STATS_PUSH_MSG=False
scrapy crawl clm -a mod='reload' -s STATS_PUSH_MSG=False -a file_path=./scrapy_proj/data/clm_keywords.json
scrapy crawl clm -a begin='2025-07-15' -a mod='update' -a query_str="groups not like '%actress%'" -s STATS_PUSH_MSG=False
scrapy crawl clm -a mod='all' -a query_str="groups not like '%actress%'" -s STATS_PUSH_MSG=False
scrapy crawl u3c3 -a begin='2025-07-04' end='2024-07-12'
scrapy crawl u3c3 -a begin='2025-07-04' -a end='2024-07-12'
scrapy crawl u3c3 -a begin='2023-08-10' -a end='2024-10-15' -s STATS_PUSH_MSG=False
scrapy crawl pbox -a mod='update' -a begin='2025-07-16'
scrapy crawl pbox -a debug=1 -a cmd='studio,movies'
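For reference, each `-a name=value` pair in these commands is handed to the spider's __init__ as a keyword argument, while `-s NAME=value` overrides a single setting for that run (STATS_PUSH_MSG and STATS_EXPORT_INTERVAL look project-specific). A minimal sketch of how the arguments used above might be consumed, using a hypothetical spider that is not part of this repo:

import scrapy


class DemoSpider(scrapy.Spider):
    """Hypothetical spider showing how `-a` arguments arrive in __init__."""
    name = "demo"

    def __init__(self, mod="update", begin=None, query_str=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.mod = mod              # 'update', 'reload' or 'all', as in the commands above
        self.begin = begin          # start date string, e.g. '2025-07-10'
        self.query_str = query_str  # raw SQL fragment forwarded to the query builder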

@@ -415,6 +415,10 @@ class SQLiteDBHandler(metaclass=SingletonMeta): # apply the singleton metaclass
else:
logging.warning(f"不支持的条件类型: {condition_type},键: {key}")
# 处理直接传入的 query_str
if "query_str" in filters:
sql += f" AND ({filters['query_str']})"
# Handle ordering, based on the validated valid_order_fields
if "order_by" in valid_filters:
sql += f" ORDER BY {valid_filters['order_by']}"

@@ -10,6 +10,35 @@ class Sis001Spider(BaseSpider):
name = SPIDER_NAME_SIS
allowed_domains = ["sis001.com"]
# Configure request headers, reusing the header info captured from curl
custom_settings = {
"DEFAULT_REQUEST_HEADERS": {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
# The Cookie reuses the value from curl directly; note that it expires and needs to be refreshed from the browser periodically
"cookie": "cdb2_sid=ZYBNN5; cdb2_uvStat=1757123386; cdb2_fuvs=25; cf_clearance=yTEYSenVbpSFycZ.Ru.XSWP8ioz0bAJiuI0LlKeJ73Q-1757123388-1.2.1.1-J2vSisVKrWSEjSMJHhXKIrgyWPzbj7s2iHLZuYkpdP8fHyqaMeq1.Q0Y4X7Lc4eSWuMz2PTn8I2pfnayrs3SdLWQ8lMTWF0.uYnOewGzf7sX9t2xkTYtD.Y99JUmkKOetkB9gi6afriLNVwgYkuE.P3nH9x8HNGW.tal8hXg.tZk8ZwwLI_4LkbX8ah4Ir7LuyjKiodZzrn8FT3aUaAy9R1iSOCIibjd.cinPjSZWrcNxDYOXH4MV5m8OM4J_iy5",
"priority": "u=0, i",
"referer": "https://sis001.com/", # Referer与第一个请求对应后续可动态调整
"sec-ch-ua": "\"Not;A=Brand\";v=\"99\", \"Microsoft Edge\";v=\"139\", \"Chromium\";v=\"139\"",
"sec-ch-ua-arch": "\"arm\"",
"sec-ch-ua-bitness": "\"64\"",
"sec-ch-ua-full-version": "\"139.0.3405.119\"",
"sec-ch-ua-full-version-list": "\"Not;A=Brand\";v=\"99.0.0.0\", \"Microsoft Edge\";v=\"139.0.3405.119\", \"Chromium\";v=\"139.0.7258.139\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-model": "\"\"",
"sec-ch-ua-platform": "\"macOS\"",
"sec-ch-ua-platform-version": "\"14.6.1\"",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
# The User-Agent strictly matches the browser to avoid being flagged as a crawler
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36 Edg/139.0.0.0"
},
"COOKIES_ENABLED": True # 启用Cookie支持
}
def __init__(self, debug='False', begin=None, *args, **kwargs):
super().__init__(*args, **kwargs)
self.debug = str(debug).lower() in ('true', '1')
@@ -45,14 +74,29 @@ class Sis001Spider(BaseSpider):
'ident' : 'forum_77'
},
]
sections2 = [
{
'plate' : 'sis_asia_yc_2',
'plate_name' : '亚有原创',
'url' : 'https://sis001.com/forum/forum-230-1.html',
'ident' : 'forum_230'
},
{
'plate' : 'sis_asia_zt_2',
'plate_name' : '亚有转帖',
'url' : 'https://sis001.com/forum/forum-58-1.html',
'ident' : 'forum_58'
},
]
for item in sections:
yield scrapy.Request(item['url'], callback=self.parse_page_common, meta=item)
yield scrapy.Request(item['url'], headers = self.custom_settings.get("DEFAULT_REQUEST_HEADERS"), callback=self.parse_page_common, meta=item)
def parse_page_common(self, response):
ident = response.meta['ident']
plate_name = response.meta['plate_name']
self.logger.debug(f"url: {response.url}, response: {response.text[:1000]}")
# Find the target table
tables = response.css(f'table#{ident}')
if not tables:
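One note on the explicit headers= argument added to the request in start_requests above: a spider's custom_settings are merged into the crawler settings at startup, so the same DEFAULT_REQUEST_HEADERS dict also reaches the downloader by default. A small, self-contained check of that merge (assuming a recent Scrapy version; the spider name and referer value are illustrative):

import scrapy
from scrapy.settings import Settings


class HeaderDemoSpider(scrapy.Spider):
    name = "header_demo"
    custom_settings = {
        "DEFAULT_REQUEST_HEADERS": {"referer": "https://sis001.com/"},
        "COOKIES_ENABLED": True,
    }


settings = Settings()
HeaderDemoSpider.update_settings(settings)           # same merge Scrapy performs when a crawl starts
print(settings.getdict("DEFAULT_REQUEST_HEADERS"))   # {'referer': 'https://sis001.com/'}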

@@ -7,8 +7,9 @@ from scrapy_proj.comm.comm_def import SPIDER_NAME_U3C3
class U001Spider(BaseSpider):
name = SPIDER_NAME_U3C3
allowed_domains = ["u001.25img.com"]
start_urls = ["https://u001.25img.com/?p=1"]
allowed_domains = ["u001.25img.com", 'u9a9.com']
start_urls = ["https://u001.25img.com/?p=1", 'https://u9a9.com/?type=2&p=1']
#start_urls = ['https://u9a9.com/?type=2&p=1']
def __init__(self, debug='False', begin=None, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -46,11 +47,32 @@ class U001Spider(BaseSpider):
yield item
if need_next :
# Pagination logic
current_page = int(response.url.split('=')[-1])
total_pages = int(response.css('script:contains("totalPages")').re_first(r'totalPages:\s*(\d+)'))
if current_page < total_pages:
if self.debug and current_page >= 5:
self.logger.info(f"debug mod. stop crawling.")
else:
yield response.follow(f"?p={current_page + 1}", self._parse)
if "u9a9" in response.url:
pagination_links = response.css('ul.pagination li a')
next_page_url = None
# 2. Iterate over the links and check whether the text is » (ignoring surrounding whitespace)
for link in pagination_links:
# Extract the <a> tag's text; get() returns a single value, None by default
link_text = link.css('::text').get() or ''
# Strip surrounding whitespace and check whether the text equals »
if link_text.strip() == '»':
# Extract the href attribute (a relative path)
next_page_rel = link.css('::attr(href)').get()
if next_page_rel:
# Join into the full URL
next_page_url = response.urljoin(next_page_rel)
break # exit the loop once found
if next_page_url:
if self.debug and 'p=5' in next_page_url:
self.logger.info(f"debug mod. stop crawling.")
else:
yield response.follow(next_page_url, self._parse)
else:
# Pagination logic
current_page = int(response.url.split('=')[-1])
total_pages = int(response.css('script:contains("totalPages")').re_first(r'totalPages:\s*(\d+)'))
if current_page < total_pages:
if self.debug and current_page >= 5:
self.logger.info(f"debug mod. stop crawling.")
else:
yield response.follow(f"?p={current_page + 1}", self._parse)