Husky Spider Utils Help

以观书法

以观书法的汉字图片爬取

核心代码

import os
import time

from husky_spider_utils import SeleniumSession
from parsel import Selector
from selenium.webdriver import ActionChains
from selenium.webdriver.common.actions.wheel_input import ScrollOrigin
from selenium.webdriver.common.by import By


class YGSF:
    """Collect image URLs for searched characters from https://web.ygsf.com/.

    For every search text, each category tab shown on the result page is
    opened in turn, the result grid is scrolled to trigger lazy loading, and
    the ``data-src`` values of the ``.m-image`` elements are appended to
    ``<output_folder>/<text>-<category>.txt`` (one URL per line).
    """

    url = "https://web.ygsf.com/"
    # Stop scrolling a category once this many distinct URLs were collected.
    max_image = 360
    # Bounds (passed to ``session.sleep_random_time``) for the pause between scrolls.
    min_sleep_time = 2
    max_sleep_time = 3

    def __init__(self, max_attempts=20, output_folder="./output", browser="edge"):
        """Open a browser session and make sure the output folder exists.

        Args:
            max_attempts: Number of scrolls that yield no new URL after which
                collection for the current category stops.
            output_folder: Directory that receives the ``.txt`` result files.
            browser: Driver type forwarded to ``SeleniumSession``
                (the original script mentions edge / firefox / chrome).
        """
        self.output_folder = output_folder
        self.session = SeleniumSession(
            selenium_init_url=self.url, driver_type=browser)
        self.max_attempts = max_attempts
        self.init()

    def init(self):
        """Create the output folder if it does not exist yet."""
        # exist_ok avoids the check-then-create race of the original code.
        os.makedirs(self.output_folder, exist_ok=True)

    def close(self):
        """Quit the underlying WebDriver so the browser process does not leak."""
        # NOTE(review): assumes session.driver is a Selenium WebDriver (it is
        # used with find_element/page_source/ActionChains elsewhere in this class).
        self.session.driver.quit()

    def search(self, text):
        """Search for ``text`` and harvest image URLs from every category tab.

        Args:
            text: The string typed into the site's search box.
        """
        self.session.selenium_get(self.url)
        self.session.try_click(".u-search")
        time.sleep(1)
        self.session.try_send_key("input[type='search']", text)
        time.sleep(1)
        self.session.try_click(".van-search__action .u-nav-icon")

        for index, ui in enumerate(self.ui_type()):
            # The first tab is already selected after the search; the others
            # have to be clicked (XPath positions are 1-based).
            if index != 0:
                self.session.try_click(
                    f'//*[@id="app"]/div/div[1]/div/div[1]/ul/li[{index + 1}]',
                    by=By.XPATH,
                )
            # Fixed pause before scraping the tab that was just selected.
            time.sleep(3)
            self.get_images_urls(ui_type=ui, text=text)

    def get_images_urls(self, ui_type="", text=""):
        """Scroll the result grid, collect distinct image URLs and save them.

        Scrolling stops when ``max_image`` distinct URLs were found or when
        ``max_attempts`` scrolls in total produced no new URL.

        Args:
            ui_type: Category label; used in the output file name.
            text: Search text; used in the output file name.
        """
        # A dict is used as an insertion-ordered set: duplicates are dropped
        # as they arrive (instead of piling up in a list) and the saved file
        # keeps a deterministic discovery order.
        found = {}
        stalled = 0  # cumulative count of scrolls that found nothing new
        while True:
            grid = self.session.driver.find_element(
                by=By.CSS_SELECTOR, value=".m-grid__box")
            ActionChains(self.session.driver).scroll_from_origin(
                ScrollOrigin(grid, 200, 300), 0, 1000).perform()

            document = Selector(text=self.session.driver.page_source)
            before = len(found)
            for src in document.css(".m-image::attr(data-src)").getall():
                if src:  # skip empty attribute values
                    found.setdefault(src, None)

            if len(found) == before:
                stalled += 1
                if stalled >= self.max_attempts:
                    break
            if len(found) >= self.max_image:
                break
            # NOTE: the original contained disabled handling for a login
            # dialog (".van-dialog__header" / ".van-dialog__cancel") here.
            self.session.sleep_random_time(self.min_sleep_time,
                                           self.max_sleep_time)

        self.save(list(found), f"{text}-{ui_type}")

    def ui_type(self):
        """Return the category labels currently shown on the result page."""
        document = Selector(text=self.session.driver.page_source)
        return document.css(".m-category .u-type::text").getall()

    def save(self, images, name):
        """Append ``images`` (one per line) to ``<output_folder>/<name>.txt``.

        The file is created when missing, also when ``images`` is empty.

        Args:
            images: Iterable of URL strings.
            name: File name without the ``.txt`` extension.
        """
        path = os.path.join(self.output_folder, name + ".txt")
        # Open once, in append mode, with an explicit encoding; the original
        # reopened the file for every URL using the platform default encoding.
        with open(path, "a", encoding="utf-8") as f:
            f.writelines(f"{image}\n" for image in images)


def get_input(input_path):
    """Read the search texts from ``input_path``, one per line.

    Surrounding whitespace is stripped and blank lines are skipped so that a
    trailing newline does not trigger a search for an empty string.

    Args:
        input_path: Path of a UTF-8 text file.

    Returns:
        A list of non-empty strings in file order.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]


def main():
    """Run a search for every line of ``input.txt`` in the working directory."""
    input_path = "input.txt"
    # Read the input first so a missing file fails before a browser is started.
    input_data = get_input(input_path)
    ygsf = YGSF(browser="edge")  # edge firefox chrome
    try:
        for text in input_data:
            ygsf.search(text)
    finally:
        ygsf.close()


if __name__ == '__main__':
    main()
08 五月 2025