modify scripts
This commit is contained in:
@ -1,3 +1,4 @@
|
|||||||
|
from datetime import datetime
|
||||||
import scrapy
|
import scrapy
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
|
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
|
||||||
@ -110,7 +111,8 @@ class Sis001Spider(BaseSpider):
|
|||||||
|
|
||||||
# Decide whether to keep paginating: stop only when all items on the page have a date earlier than the begin date
|
# Decide whether to keep paginating: stop only when all items on the page have a date earlier than the begin date
|
||||||
up_date = parse_date_to_datetime(item['update_date'])
|
up_date = parse_date_to_datetime(item['update_date'])
|
||||||
if up_date and self.begin and up_date < self.begin :
|
self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
|
||||||
|
if up_date and self.begin and (up_date < self.begin or up_date>datetime.now()) :
|
||||||
self.logger.debug(f"find early data.")
|
self.logger.debug(f"find early data.")
|
||||||
else:
|
else:
|
||||||
need_next = True
|
need_next = True
|
||||||
|
|||||||
@ -1,3 +1,4 @@
|
|||||||
|
from datetime import datetime
|
||||||
import scrapy
|
import scrapy
|
||||||
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
|
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
|
||||||
from scrapy_proj.items import U001Item
|
from scrapy_proj.items import U001Item
|
||||||
@ -11,7 +12,7 @@ class U001Spider(BaseSpider):
|
|||||||
def __init__(self, debug='False', begin=None, *args, **kwargs):
|
def __init__(self, debug='False', begin=None, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
|
self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
|
||||||
self.logger.info(f"debug mod: {self.debug}")
|
self.logger.info(f"debug mod: {self.debug}, begin: {begin}")
|
||||||
|
|
||||||
self.begin = parse_date_to_datetime(begin) if begin else None
|
self.begin = parse_date_to_datetime(begin) if begin else None
|
||||||
|
|
||||||
@ -36,7 +37,8 @@ class U001Spider(BaseSpider):
|
|||||||
|
|
||||||
# Decide whether to keep paginating: stop only when all items on the page have a date earlier than the begin date
|
# Decide whether to keep paginating: stop only when all items on the page have a date earlier than the begin date
|
||||||
up_date = parse_date_to_datetime(item['update_date'])
|
up_date = parse_date_to_datetime(item['update_date'])
|
||||||
if up_date and self.begin and up_date < self.begin :
|
self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
|
||||||
|
if up_date and self.begin and (up_date < self.begin or up_date>datetime.now()):
|
||||||
self.logger.debug(f"find early data.")
|
self.logger.debug(f"find early data.")
|
||||||
else:
|
else:
|
||||||
need_next = True
|
need_next = True
|
||||||
|
|||||||
Reference in New Issue
Block a user