From 95b4d8b414cd80b166e1dc36587faff9d64be91b Mon Sep 17 00:00:00 2001 From: oscarz Date: Sat, 5 Jul 2025 12:52:28 +0800 Subject: [PATCH] modify scripts --- scrapy_proj/scrapy_proj/spiders/sis_spider.py | 4 +++- scrapy_proj/scrapy_proj/spiders/u3c3_spider.py | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/scrapy_proj/scrapy_proj/spiders/sis_spider.py b/scrapy_proj/scrapy_proj/spiders/sis_spider.py index 34c5848..d019102 100644 --- a/scrapy_proj/scrapy_proj/spiders/sis_spider.py +++ b/scrapy_proj/scrapy_proj/spiders/sis_spider.py @@ -1,3 +1,4 @@ +from datetime import datetime import scrapy from urllib.parse import urljoin from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element @@ -110,7 +111,8 @@ class Sis001Spider(BaseSpider): # 判断是否还要翻页,只有满足所有页面的数据,日期均小于开始日期时,停止翻页 up_date = parse_date_to_datetime(item['update_date']) - if up_date and self.begin and up_date < self.begin : + self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}") + if up_date and self.begin and (up_date < self.begin or up_date>datetime.now()) : self.logger.debug(f"find early data.") else: need_next = True diff --git a/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py b/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py index 6aae015..7268032 100644 --- a/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py +++ b/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py @@ -1,3 +1,4 @@ +from datetime import datetime import scrapy from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element from scrapy_proj.items import U001Item @@ -11,7 +12,7 @@ class U001Spider(BaseSpider): def __init__(self, debug='False', begin=None, *args, **kwargs): super().__init__(*args, **kwargs) self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False - self.logger.info(f"debug mod: {self.debug}") + self.logger.info(f"debug mod: {self.debug}, begin: {begin}") self.begin = parse_date_to_datetime(begin) if begin else None @@ -36,7 +37,8 @@ class U001Spider(BaseSpider): # 判断是否还要翻页,只有满足所有页面的数据,日期均小于开始日期时,停止翻页 up_date = parse_date_to_datetime(item['update_date']) - if up_date and self.begin and up_date < self.begin : + self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}") + if up_date and self.begin and (up_date < self.begin or up_date>datetime.now()): self.logger.debug(f"find early data.") else: need_next = True