modify scripts
@@ -1,8 +1,11 @@
scrapy crawl clm -a mod='update' -a begin='2025-07-10' -s STATS_EXPORT_INTERVAL=180 -a query_str="groups='actress' and tags like '%vixen%' "
scrapy crawl clm -a mod='update' -a begin='2025-07-10' -s STATS_PUSH_MSG=False
scrapy crawl clm -a mod='reload' -s STATS_PUSH_MSG=False -a file_path=./scrapy_proj/data/clm_keywords.json
scrapy crawl clm -a begin='2025-07-15' -a mod='update' -a query_str="groups not like '%actress%'" -s STATS_PUSH_MSG=False
scrapy crawl clm -a mod='all' -a query_str="groups not like '%actress%'" -s STATS_PUSH_MSG=False

scrapy crawl u3c3 -a begin='2025-07-04' end='2024-07-12'
scrapy crawl u3c3 -a begin='2025-07-04' -a end='2024-07-12'
scrapy crawl u3c3 -a begin='2023-08-10' -a end='2024-10-15' -s STATS_PUSH_MSG=False

scrapy crawl pbox -a mod='update' -a begin='2025-07-16'
scrapy crawl pbox -a debug=1 -a cmd='studio,movies'
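For reference, each -a key=value flag above is delivered to the spider constructor as a keyword argument. A minimal sketch of that mechanism (the real clm spider's signature is not shown in this diff, so the parameter names below are only the ones visible in the commands):

import scrapy

class ClmSpider(scrapy.Spider):
    name = "clm"

    # Scrapy passes every -a key=value from the command line into __init__
    # as a keyword argument; the defaults here are illustrative.
    def __init__(self, mod='update', begin=None, query_str=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.mod = mod
        self.begin = begin
        self.query_str = query_str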
@@ -415,6 +415,10 @@ class SQLiteDBHandler(metaclass=SingletonMeta):  # applies the singleton metaclass
            else:
                logging.warning(f"Unsupported condition type: {condition_type}, key: {key}")

        # Handle a query_str passed in directly
        if "query_str" in filters:
            sql += f" AND ({filters['query_str']})"

        # Handle ordering (based on the validated valid_order_fields)
        if "order_by" in valid_filters:
            sql += f" ORDER BY {valid_filters['order_by']}"
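Note that interpolating filters['query_str'] splices raw SQL into the statement. Whole predicates cannot be parameterized, but plain values can; a minimal hedged sketch with an assumed items table and sqlite3 placeholders:

import sqlite3

# Hedged sketch (not part of this commit): bind values with ? placeholders
# instead of formatting them into the SQL string, and whitelist ORDER BY
# columns the same way valid_order_fields is used above.
def query_by_group(conn: sqlite3.Connection, group: str, order_by: str = "id"):
    allowed_order = {"id", "pub_date"}  # illustrative whitelist
    order_col = order_by if order_by in allowed_order else "id"
    sql = f"SELECT * FROM items WHERE groups = ? ORDER BY {order_col}"
    return conn.execute(sql, (group,)).fetchall()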
@@ -10,6 +10,35 @@ class Sis001Spider(BaseSpider):
    name = SPIDER_NAME_SIS
    allowed_domains = ["sis001.com"]

    # Request headers (reusing the header info from the captured curl command)
    custom_settings = {
        "DEFAULT_REQUEST_HEADERS": {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            # Cookie copied verbatim from the curl command (note: cookies expire and need to be refreshed from the browser periodically)
            "cookie": "cdb2_sid=ZYBNN5; cdb2_uvStat=1757123386; cdb2_fuvs=25; cf_clearance=yTEYSenVbpSFycZ.Ru.XSWP8ioz0bAJiuI0LlKeJ73Q-1757123388-1.2.1.1-J2vSisVKrWSEjSMJHhXKIrgyWPzbj7s2iHLZuYkpdP8fHyqaMeq1.Q0Y4X7Lc4eSWuMz2PTn8I2pfnayrs3SdLWQ8lMTWF0.uYnOewGzf7sX9t2xkTYtD.Y99JUmkKOetkB9gi6afriLNVwgYkuE.P3nH9x8HNGW.tal8hXg.tZk8ZwwLI_4LkbX8ah4Ir7LuyjKiodZzrn8FT3aUaAy9R1iSOCIibjd.cinPjSZWrcNxDYOXH4MV5m8OM4J_iy5",
            "priority": "u=0, i",
            "referer": "https://sis001.com/",  # matches the first request; can be adjusted dynamically for later ones
            "sec-ch-ua": "\"Not;A=Brand\";v=\"99\", \"Microsoft Edge\";v=\"139\", \"Chromium\";v=\"139\"",
            "sec-ch-ua-arch": "\"arm\"",
            "sec-ch-ua-bitness": "\"64\"",
            "sec-ch-ua-full-version": "\"139.0.3405.119\"",
            "sec-ch-ua-full-version-list": "\"Not;A=Brand\";v=\"99.0.0.0\", \"Microsoft Edge\";v=\"139.0.3405.119\", \"Chromium\";v=\"139.0.7258.139\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-model": "\"\"",
            "sec-ch-ua-platform": "\"macOS\"",
            "sec-ch-ua-platform-version": "\"14.6.1\"",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            # User-Agent strictly matches the browser to avoid being flagged as a crawler
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36 Edg/139.0.0.0"
        },
        "COOKIES_ENABLED": True  # enable cookie support
    }

    def __init__(self, debug='False', begin=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.debug = str(debug).lower() in ('true', '1')
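The cf_clearance cookie above is pasted from a browser session and expires. One hedged tweak (not in this commit) is to let an environment variable override it at import time, before Scrapy reads custom_settings; SIS001_COOKIE is an illustrative name:

import os

# Hypothetical override, placed after the class definition at module level:
_cookie = os.environ.get("SIS001_COOKIE")
if _cookie:
    Sis001Spider.custom_settings["DEFAULT_REQUEST_HEADERS"]["cookie"] = _cookie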
@@ -45,14 +74,29 @@ class Sis001Spider(BaseSpider):
                'ident': 'forum_77'
            },
        ]
        sections2 = [
            {
                'plate': 'sis_asia_yc_2',
                'plate_name': '亚有原创',
                'url': 'https://sis001.com/forum/forum-230-1.html',
                'ident': 'forum_230'
            },
            {
                'plate': 'sis_asia_zt_2',
                'plate_name': '亚有转帖',
                'url': 'https://sis001.com/forum/forum-58-1.html',
                'ident': 'forum_58'
            },
        ]

        for item in sections:
            yield scrapy.Request(item['url'], callback=self.parse_page_common, meta=item)
            yield scrapy.Request(item['url'], headers=self.custom_settings.get("DEFAULT_REQUEST_HEADERS"), callback=self.parse_page_common, meta=item)


    def parse_page_common(self, response):
        ident = response.meta['ident']
        plate_name = response.meta['plate_name']
        self.logger.debug(f"url: {response.url}, response: {response.text[:1000]}")
        # Locate the target table
        tables = response.css(f'table#{ident}')
        if not tables:
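The body of parse_page_common is truncated by this hunk. A hedged sketch of how the matched table's rows might be consumed (the selectors are assumptions, not the project's actual ones):

# Illustrative continuation only; row/anchor selectors are guesses.
for row in tables.css('tbody > tr'):
    title = row.css('a::text').get(default='').strip()
    href = row.css('a::attr(href)').get()
    if href:
        self.logger.debug(f"{plate_name}: {title} -> {response.urljoin(href)}")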
@@ -7,8 +7,9 @@ from scrapy_proj.comm.comm_def import SPIDER_NAME_U3C3
class U001Spider(BaseSpider):
    name = SPIDER_NAME_U3C3
    allowed_domains = ["u001.25img.com"]
    start_urls = ["https://u001.25img.com/?p=1"]
    allowed_domains = ["u001.25img.com", 'u9a9.com']
    start_urls = ["https://u001.25img.com/?p=1", 'https://u9a9.com/?type=2&p=1']
    #start_urls = ['https://u9a9.com/?type=2&p=1']

    def __init__(self, debug='False', begin=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
@@ -46,11 +47,32 @@ class U001Spider(BaseSpider):
            yield item

        if need_next:
            # Pagination logic
            current_page = int(response.url.split('=')[-1])
            total_pages = int(response.css('script:contains("totalPages")').re_first(r'totalPages:\s*(\d+)'))
            if current_page < total_pages:
                if self.debug and current_page >= 5:
                    self.logger.info("debug mode, stop crawling.")
                else:
                    yield response.follow(f"?p={current_page + 1}", self._parse)
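Hedged aside: re_first() returns None when no script block mentions totalPages, and int(None) raises a TypeError. A defensive variant of the lookup above (not part of the commit):

raw = response.css('script:contains("totalPages")').re_first(r'totalPages:\s*(\d+)')
total_pages = int(raw) if raw else current_page  # fall back: stop paginating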
if "u9a9" in response.url:
|
||||
pagination_links = response.css('ul.pagination li a')
|
||||
next_page_url = None
|
||||
# 2. 遍历链接,检查文本是否为»(忽略前后空格)
|
||||
for link in pagination_links:
|
||||
# 提取a标签的文本(使用get()获取单个值,默认返回None)
|
||||
link_text = link.css('::text').get() or ''
|
||||
# 去除文本前后空格,判断是否等于»
|
||||
if link_text.strip() == '»':
|
||||
# 提取href属性(相对路径)
|
||||
next_page_rel = link.css('::attr(href)').get()
|
||||
if next_page_rel:
|
||||
# 拼接完整URL
|
||||
next_page_url = response.urljoin(next_page_rel)
|
||||
break # 找到后退出循环
|
||||
if next_page_url:
|
||||
if self.debug and 'p=5' in next_page_url:
|
||||
self.logger.info(f"debug mod. stop crawling.")
|
||||
else:
|
||||
yield response.follow(next_page_url, self._parse)
|
||||
else:
|
||||
# 翻页逻辑
|
||||
current_page = int(response.url.split('=')[-1])
|
||||
total_pages = int(response.css('script:contains("totalPages")').re_first(r'totalPages:\s*(\d+)'))
|
||||
if current_page < total_pages:
|
||||
if self.debug and current_page >= 5:
|
||||
self.logger.info(f"debug mod. stop crawling.")
|
||||
else:
|
||||
yield response.follow(f"?p={current_page + 1}", self._parse)
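As an aside, the »-link scan above can be collapsed into a single XPath query. An equivalent hedged sketch, not part of the commit:

# Select the pagination link whose trimmed text is » and take its href;
# response.follow resolves the relative path.
next_page_rel = response.xpath(
    '//ul[contains(@class, "pagination")]//a[normalize-space(text())="»"]/@href'
).get()
if next_page_rel:
    yield response.follow(next_page_rel, self._parse)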