modify scripts
@@ -18,7 +18,7 @@ class FailureMonitorExtension:
     @classmethod
     def from_crawler(cls, crawler):
         # Read the parameters from the settings
-        max_consecutive = crawler.settings.getint('EXT_FAIL_MONI_MAX_CONSECUTIVE_FAILURES', 10)
+        max_consecutive = crawler.settings.getint('EXT_FAIL_MONI_MAX_CONSECUTIVE_FAILURES', 100)
         failure_rate = crawler.settings.getfloat('EXT_FAIL_MONI_RATE_THRESHOLD', 0.5)
         time_window = crawler.settings.getint('EXT_FAIL_MONI_FAILURE_TIME_WINDOW', 60)

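These EXT_FAIL_MONI_* values are read in from_crawler, so the raised default of 100 consecutive failures only applies when the setting is not defined in settings.py. A minimal sketch of how the extension and its settings would be wired up, assuming the class lives at scrapy_proj.extensions.FailureMonitorExtension (the module path is not shown in this diff):

EXTENSIONS = {
    "scrapy_proj.extensions.FailureMonitorExtension": 500,  # import path assumed for illustration
}

EXT_FAIL_MONI_MAX_CONSECUTIVE_FAILURES = 100  # same as the new default above
EXT_FAIL_MONI_RATE_THRESHOLD = 0.5            # failure-rate threshold
EXT_FAIL_MONI_FAILURE_TIME_WINDOW = 60        # time window in seconds
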
@@ -13,6 +13,7 @@

 import os
 import sqlite3
+import scrapy
 import logging
 from datetime import datetime
 from scrapy_proj.items import U001Item, Sis001Item, IAFDPersonItem, IAFDPersonDetailItem, IAFDMovieItem, IAFDMovieDetailItem
@@ -75,24 +76,24 @@ class SQLitePipeline(SQLiteDBHandler):
         return item

     def _process_u001_item(self, item, spider):
-        logging.debug(f"insert one item. href:{spider.name}")
+        spider.logger.debug(f"insert one item. href:{spider.name}")
         return self.insert_or_update_common(item, tbl_name=self.tbl_name_u3c3, uniq_key='url', exists_do_nothing=True)

     def _process_sis001_item(self, item, spider):
-        logging.debug(f"insert one item. href:{spider.name}")
+        spider.logger.debug(f"insert one item. href:{spider.name}")
         return self.insert_or_update_common(item, tbl_name=self.tbl_name_sis, uniq_key='url', exists_do_nothing=True)

     def _process_iafd_person_item(self, item, spider):
-        logging.info(f"deal with persion item. {item}")
+        spider.logger.debug(f"deal with persion item. {item}")

     def _process_iafd_movie_item(self, item, spider):
-        logging.info(f"deal with movie item. {item}")
+        spider.logger.debug(f"deal with movie item. {item}")

     def _process_iafd_person_detail_item(self, item, spider):
-        logging.info(f"deal with persion item. {item}")
+        spider.logger.debug(f"deal with persion item. {item}")

     def _process_iafd_movie_detail_item(self, item, spider):
-        logging.info(f"deal with movie item. {item}")
+        spider.logger.debug(f"deal with movie item. {item}")

     def close_spider(self, spider):
         self.conn.close()
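In the pipeline methods above, the module-level logging calls become spider.logger calls, so each record is attributed to the spider that produced the item. A minimal sketch of the difference, using a throwaway spider class (the name "demo" is made up for illustration):

import logging

import scrapy

class DemoSpider(scrapy.Spider):
    # hypothetical spider, defined only to show the logging difference
    name = "demo"

logging.basicConfig(level=logging.DEBUG, format="%(name)s %(levelname)s %(message)s")
spider = DemoSpider()

logging.debug("insert one item.")        # emitted on the root logger, no spider context
spider.logger.debug("insert one item.")  # emitted on the "demo" logger, named after the spider
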
@@ -29,9 +29,9 @@ NEWSPIDER_MODULE = "scrapy_proj.spiders"
 ADDONS = {}

 # Concurrency settings
-CONCURRENT_REQUESTS = 1
-CONCURRENT_REQUESTS_PER_DOMAIN = 1
-CONCURRENT_ITEMS = 100
+CONCURRENT_REQUESTS = 10
+CONCURRENT_REQUESTS_PER_DOMAIN = 5
+CONCURRENT_ITEMS = 1000

 # Download delay
 DOWNLOAD_DELAY = 0.3

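These are project-wide defaults, so every spider now runs at the higher concurrency. A spider that needs gentler limits can still override them locally through custom_settings; a minimal sketch (the spider below is hypothetical, only the setting names come from the hunk above):

import scrapy

class GentleSpider(scrapy.Spider):
    name = "gentle_example"  # made-up name, for illustration only

    # per-spider values take precedence over the project settings above
    custom_settings = {
        "CONCURRENT_REQUESTS_PER_DOMAIN": 1,
        "DOWNLOAD_DELAY": 2.0,
    }
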
@@ -44,7 +44,7 @@ class BaseSpider(scrapy.Spider):
             self.logger.warning(f"页面被拦截: {response.url}, 原因: {reason}")
             return self.handle_blocked(response, reason)

-        '''
+
         # Determine the actual parse callback
         callback = self._get_callback(response)
         if callback:
@@ -52,8 +52,8 @@ class BaseSpider(scrapy.Spider):
         else:
             # If no callback was specified, fall back to the default _parse method
             yield from self._parse(response)
-        '''
-        yield from self._parse(response)
+
+        #yield from self._parse(response)

     def _get_callback(self, response):
         """Get the callback method for this request"""
@@ -154,3 +154,25 @@ class BaseSpider(scrapy.Spider):
                 dont_filter=True,
                 priority=response.request.priority - 1
             )
+
+
+"""Extract the text of a page element; if it contains child elements, concatenate all of their text. A non-empty title attribute takes precedence."""
+def extract_text_from_element(element, use_title=False):
+    # Check whether the title attribute exists and is not an empty string
+    if use_title:
+        title_attr = element.attrib.get('title', '').strip()
+        if title_attr:
+            return title_attr
+
+    # Otherwise use XPath's string(.) to collect the text of all descendant nodes
+    full_text = element.xpath('string(.)').get(default='').strip()
+
+    # If the result is empty, collect the individual text fragments, strip each one, and join them
+    if not full_text:
+        text_parts = element.css('::text').getall()
+        # Strip each text fragment
+        stripped_parts = [part.strip() for part in text_parts]
+        # Drop empty strings and join the rest
+        full_text = ' '.join(filter(None, stripped_parts))
+
+    return full_text or '无标题'  # make sure at least "无标题" is returned
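The new module-level helper can be exercised outside a crawl with a Selector built from sample markup (the HTML snippet below is made up for illustration; the import mirrors the one the spiders use further down):

from scrapy.selector import Selector
from scrapy_proj.spiders.base_spider import extract_text_from_element

html = '<div><a href="/thread/1" title="Full descriptive title">Short text</a></div>'
link = Selector(text=html).css('div a')

print(extract_text_from_element(link, use_title=True))  # -> 'Full descriptive title'
print(extract_text_from_element(link))                  # -> 'Short text'
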
@@ -206,7 +206,7 @@ class IAFDSpider(BaseSpider):
             item['from_dist_list'] = 1
             item['from_stu_list'] = 0
             yield item
-            yield scrapy.Request(dis_url, callback=self.parse_movie_detail_page)
+            #yield scrapy.Request(dis_url, callback=self.parse_movie_detail_page)

     def parse_studios_list_page(self, response):
         select_element = response.css('select[name="Studio"]')
@@ -224,7 +224,7 @@ class IAFDSpider(BaseSpider):
             item['from_dist_list'] = 0
             item['from_stu_list'] = 1
             yield item
-            yield scrapy.Request(stu_url, callback=self.parse_movie_detail_page)
+            #yield scrapy.Request(stu_url, callback=self.parse_movie_detail_page)

     def parse_person_detail_page(self, response):
         item = IAFDPersonDetailItem()
@@ -1,39 +1,20 @@
 import scrapy
-from scrapy_proj.spiders.base_spider import BaseSpider
-from scrapy_proj.items import Sis001Item
 from urllib.parse import urljoin
-import re
-
-def extract_title(element):
-    """Extract the text of an <a> tag; a non-empty title attribute takes precedence"""
-    # Check whether the title attribute exists and is not an empty string
-    title_attr = element.attrib.get('title', '').strip()
-    if title_attr:
-        return title_attr
-
-    # Otherwise use XPath's string(.) to collect the text of all descendant nodes
-    full_text = element.xpath('string(.)').get(default='').strip()
-
-    # If the result is empty, collect the individual text fragments, strip each one, and join them
-    if not full_text:
-        text_parts = element.css('::text').getall()
-        # Strip each text fragment
-        stripped_parts = [part.strip() for part in text_parts]
-        # Drop empty strings and join the rest
-        full_text = ' '.join(filter(None, stripped_parts))
-
-    return full_text or '无标题'  # make sure at least "无标题" is returned
-
+from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
+from scrapy_proj.items import Sis001Item
+from scrapy_proj.utils.utils import parse_size_format, parse_date_to_datetime

 class Sis001Spider(BaseSpider):
     name = "sis"
     allowed_domains = ["sis001.com"]

-    def __init__(self, debug='False', *args, **kwargs):
+    def __init__(self, debug='False', begin=None, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
         self.logger.info(f"debug mod: {self.debug}")

+        self.begin = parse_date_to_datetime(begin) if begin else None
+
     # Entry point, triggered by the base class
     def custom_start_requests(self):
         sections = [
@@ -73,33 +54,34 @@ class Sis001Spider(BaseSpider):
         # Find the target table
         tables = response.css(f'table#{ident}')
         if not tables:
-            self.logger.warning(f"cannot found table. url: {response.url}")
+            self.logger.error(f"cannot found table. url: {response.url}")
             return

         main_table = None
         for table in tables:
             # Check whether the table header contains "版块主题"
-            tbody_tile = extract_title(table.css('thead'))
+            tbody_tile = extract_text_from_element(table.css('thead'))
             if "版块主题" in tbody_tile:
                 main_table = table
                 break

         if not main_table:
-            self.logger.warning(f"cannot found table in right topic. url: {response.url}")
+            self.logger.error(f"cannot found table in right topic. url: {response.url}")
             return

+        need_next = False
         # Parse the table rows
         for body in main_table.css('tbody[id^="normalthread_"]'):
             for row in body.css('tr'):
                 tds = row.css('td')
                 if len(tds) < 6:
-                    self.logger.warning(f"跳过不完整的行,列数: {len(tds)}")
+                    self.logger.warning(f"跳过不完整的行,列数: {len(tds)}. url: {response.url}")
                     continue

                 # Parse category and title
                 th_lock = row.css('th')
                 if not th_lock:
-                    self.logger.warning("未找到th.lock元素")
+                    self.logger.warning(f"未找到th.lock元素. url: {response.url}")
                     continue

                 # Parse the category link
@@ -115,7 +97,7 @@ class Sis001Spider(BaseSpider):

                 # Parse size and format
                 size_text = tds[4].css('::text').get(default="").strip()
-                size_gb, file_format = self.parse_size_format(size_text)
+                size_gb, file_format = parse_size_format(size_text)

                 # Build the item
                 item = Sis001Item()
@@ -125,45 +107,22 @@ class Sis001Spider(BaseSpider):
                 item['size_text'] = size_text
                 item['size_gb'] = size_gb
                 item['update_date'] = pub_date

+                # Decide whether to keep paginating: stop only when every row on the page is older than the begin date
+                up_date = parse_date_to_datetime(item['update_date'])
+                if up_date and self.begin and up_date < self.begin:
+                    self.logger.debug(f"find early data.")
+                else:
+                    need_next = True
+
                 yield item

-        # Handle pagination
-        next_url = response.css('div.pages_btns a.next::attr(href)').get()
-        if next_url:
-            next_url = urljoin(response.url, next_url)
-            self.logger.info(f"发现下一页: {next_url}")
-            if not self.debug:
-                yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)
-
-    def parse_size_format(self, size_text: str):
-        """Parse size and format (original logic unchanged)"""
-        try:
-            if not size_text:
-                return 0.0, "未知格式"
-
-            # Split size and format
-            parts = size_text.split('/')
-            format_part = parts[1].strip() if len(parts) > 1 else "未知格式"
-
-            # Parse the size
-            size_part = parts[0].strip()
-            match = re.search(r'(\d+\.\d+|\d+)\s*([A-Za-z]+)', size_part)
-
-            if not match:
-                self.logger.warning(f"无法解析大小: {size_part}")
-                return 0.0, format_part
-
-            value, unit = match.groups()
-            value = float(value)
-
-            if unit.lower() == 'mb' or unit.lower() == 'm':
-                return round(value / 1024, 2), format_part
-            elif unit.lower() == 'gb' or unit.lower() == 'g':
-                return round(value, 2), format_part
-            else:
-                self.logger.warning(f"未知单位: {unit}")
-                return 0.0, format_part
-
-        except Exception as e:
-            self.logger.error(f"解析大小格式时出错: {e}")
-            return 0.0, "未知格式"
+        # Handle pagination
+        if need_next:
+            next_url = response.css('div.pages_btns a.next::attr(href)').get()
+            if next_url:
+                next_url = urljoin(response.url, next_url)
+                self.logger.debug(f"发现下一页: {next_url}")
+                if not self.debug:
+                    yield scrapy.Request(next_url, callback=self.parse_page_common, meta=response.meta)
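Both spiders now accept begin (and debug) as ordinary Scrapy spider arguments, and pagination stops once every row on a page is older than that date. A run would normally be started with scrapy crawl sis -a begin=2024-01-01 -a debug=true; the same thing expressed programmatically looks roughly like this (the spider's module path is assumed, it is not shown in this diff):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# module path assumed for illustration
from scrapy_proj.spiders.sis001 import Sis001Spider

process = CrawlerProcess(get_project_settings())
# keyword arguments are forwarded to Sis001Spider.__init__
process.crawl(Sis001Spider, begin="2024-01-01", debug="true")
process.start()
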
@@ -1,44 +1,27 @@
 import scrapy
-from scrapy_proj.spiders.base_spider import BaseSpider
+from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
 from scrapy_proj.items import U001Item
-from scrapy_proj.utils.size_converter import parse_size
+from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime

-def extract_title(element):
-    """Extract the text of an <a> tag; a non-empty title attribute takes precedence"""
-    # Check whether the title attribute exists and is not an empty string
-    title_attr = element.attrib.get('title', '').strip()
-    if title_attr:
-        return title_attr
-
-    # Otherwise use XPath's string(.) to collect the text of all descendant nodes
-    full_text = element.xpath('string(.)').get(default='').strip()
-
-    # If the result is empty, collect the individual text fragments, strip each one, and join them
-    if not full_text:
-        text_parts = element.css('::text').getall()
-        # Strip each text fragment
-        stripped_parts = [part.strip() for part in text_parts]
-        # Drop empty strings and join the rest
-        full_text = ' '.join(filter(None, stripped_parts))
-
-    return full_text or '无标题'  # make sure at least "无标题" is returned
-
 class U001Spider(BaseSpider):
     name = "u3c3"
     allowed_domains = ["u001.25img.com"]
     start_urls = ["https://u001.25img.com/?p=1"]

-    def __init__(self, debug='False', *args, **kwargs):
+    def __init__(self, debug='False', begin=None, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
         self.logger.info(f"debug mod: {self.debug}")

+        self.begin = parse_date_to_datetime(begin) if begin else None
+
     # Entry point, triggered by the base class
     def _parse(self, response):
+        need_next = False
         for row in response.css('table.torrent-list tbody tr'):
             item = U001Item()
             item['category'] = row.css('td:nth-child(1) a::attr(title)').get()
-            item['title'] = extract_title(row.css('td:nth-child(2) a'))
+            item['title'] = extract_text_from_element(row.css('td:nth-child(2) a'), use_title=True)
             item['url'] = response.urljoin(row.css('td:nth-child(2) a::attr(href)').get())

             links = row.css('td:nth-child(3) a::attr(href)').getall()
@@ -50,13 +33,21 @@ class U001Spider(BaseSpider):
             item['size_gb'] = parse_size(size_text)

             item['update_date'] = row.css('td:nth-child(5)::text').get(default='').strip()

+            # Decide whether to keep paginating: stop only when every row on the page is older than the begin date
+            up_date = parse_date_to_datetime(item['update_date'])
+            if up_date and self.begin and up_date < self.begin:
+                self.logger.debug(f"find early data.")
+            else:
+                need_next = True
             yield item

-        # Pagination logic
-        current_page = int(response.url.split('=')[-1])
-        total_pages = int(response.css('script:contains("totalPages")').re_first(r'totalPages:\s*(\d+)'))
-        if current_page < total_pages:
-            if self.debug and current_page >= 5:
-                self.logger.info(f"debug mod. stop crawling.")
-            else:
-                yield response.follow(f"?p={current_page + 1}", self.parse)
+        # Pagination logic
+        if need_next:
+            current_page = int(response.url.split('=')[-1])
+            total_pages = int(response.css('script:contains("totalPages")').re_first(r'totalPages:\s*(\d+)'))
+            if current_page < total_pages:
+                if self.debug and current_page >= 5:
+                    self.logger.info(f"debug mod. stop crawling.")
+                else:
+                    yield response.follow(f"?p={current_page + 1}", self._parse)
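The pagination guard pulls totalPages out of an inline script block. The selector-and-regex pair can be checked in isolation with a Selector (the script content below is a made-up stand-in for the real page):

from scrapy.selector import Selector

html = '<script>var listOpts = {page: 1, totalPages: 42};</script>'
sel = Selector(text=html)

total_pages = int(sel.css('script:contains("totalPages")').re_first(r'totalPages:\s*(\d+)'))
print(total_pages)  # 42
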
@@ -1,19 +0,0 @@
-import re
-
-def parse_size(size_text):
-    try:
-        match = re.search(r'(\d+\.\d+|\d+)\s*([A-Za-z]+)', size_text)
-        if not match:
-            return 0.0
-        value, unit = match.groups()
-        value = float(value)
-        if unit.lower() == 'mb':
-            return round(value / 1024, 2)
-        elif unit.lower() == 'kb':
-            return round(value / 1024 / 1024, 2)
-        elif unit.lower() == 'gb':
-            return round(value, 2)
-        else:
-            return 0.0
-    except Exception:
-        return 0.0
scrapy_proj/scrapy_proj/utils/utils.py (new file, 80 lines)
@@ -0,0 +1,80 @@
+import re
+from datetime import datetime
+
+''' Parse strings such as xxxMB, xxxGB, xxxM and normalize the unit to GB '''
+def parse_size(size_text):
+    try:
+        match = re.search(r'(\d+\.\d+|\d+)\s*([A-Za-z]+)', size_text)
+        if not match:
+            return 0.0
+        value, unit = match.groups()
+        value = float(value)
+        if unit.lower() == 'mb' or unit.lower() == 'm':
+            return round(value / 1024, 2)
+        elif unit.lower() == 'kb' or unit.lower() == 'k':
+            return round(value / 1024 / 1024, 2)
+        elif unit.lower() == 'gb' or unit.lower() == 'g':
+            return round(value, 2)
+        else:
+            return 0.0
+    except Exception:
+        return 0.0
+
+''' Parse strings such as 5GB/MP4: extract the video format and convert the size '''
+def parse_size_format(size_text: str):
+    try:
+        if not size_text:
+            return 0.0, "未知格式"
+
+        # Split size and format
+        parts = size_text.split('/')
+        format_part = parts[1].strip() if len(parts) > 1 else "未知格式"
+
+        # Parse the size
+        return parse_size(parts[0].strip()), format_part
+
+        size_part = parts[0].strip()
+        match = re.search(r'(\d+\.\d+|\d+)\s*([A-Za-z]+)', size_part)
+
+        if not match:
+            return 0.0, format_part
+
+        value, unit = match.groups()
+        value = float(value)
+
+        if unit.lower() == 'mb' or unit.lower() == 'm':
+            return round(value / 1024, 2), format_part
+        elif unit.lower() == 'gb' or unit.lower() == 'g':
+            return round(value, 2), format_part
+        else:
+            return 0.0, format_part
+
+    except Exception as e:
+        return 0.0, "未知格式"
+
+
+"""Convert a date string to a datetime object; several formats are supported"""
+def parse_date_to_datetime(date_str):
+    # Possible date formats:
+    # 1. yyyy-mm-dd
+    # 2. yyyy-m-d
+    # 3. yyyy/mm/dd
+    # 4. yyyy/m/dd
+    # 5. yyyy年mm月dd日 (Chinese format)
+    # 6. yyyy-mm-dd hh:mm:ss
+
+    # Try each format in turn
+    patterns = [
+        (r'^(\d{4})-(\d{1,2})-(\d{1,2})$', "%Y-%m-%d"),          # yyyy-mm-dd or yyyy-m-d
+        (r'^(\d{4})/(\d{1,2})/(\d{1,2})$', "%Y/%m/%d"),          # yyyy/mm/dd or yyyy/m/dd
+        (r'^(\d{4})年(\d{1,2})月(\d{1,2})日$', "%Y年%m月%d日"),   # Chinese format
+        (r'^(\d{4})-(\d{1,2})-(\d{1,2}) (\d{1,2}):(\d{1,2}):(\d{1,2})$', "%Y-%m-%d %H:%M:%S"),  # with time
+    ]
+
+    for pattern, format_str in patterns:
+        match = re.match(pattern, date_str)
+        if match:
+            return datetime.strptime(date_str, format_str)
+
+    # No format matched
+    return None
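Expected behaviour of the new helpers, as a small doctest-style check (the values follow directly from the code above; note that parse_size_format returns early, so the block after its first return is dead code):

from scrapy_proj.utils.utils import parse_size, parse_size_format, parse_date_to_datetime

print(parse_size("512MB"))                     # 0.5  (normalized to GB)
print(parse_size_format("5GB/MP4"))            # (5.0, 'MP4')
print(parse_size_format(""))                   # (0.0, '未知格式')
print(parse_date_to_datetime("2024年3月5日"))    # datetime.datetime(2024, 3, 5, 0, 0)
print(parse_date_to_datetime("not a date"))    # None
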