|
import re
import time
import random
from typing import Union, Literal, Optional, Tuple
from playwright.sync_api import sync_playwright
from playwright._impl._errors import TimeoutError
from getuseragent import UserAgent
from helpers.parsers_helpers import *
from tqdm import tqdm

from parsers_dataclasses import OzonProduct


class Ozon:
    """Playwright-driven scraper for ozon.ru search results and product pages.

    ``find_all_goods`` fills ``goods_links`` with product URLs found for a
    keyword; ``describe_all_goods`` then visits every collected URL and
    appends one dictionary per product to ``parsing_result``.
    """

    __version__ = "0.2.2"

    def __init__(self):
        """Create an empty scraper; no browser is started here.

        version = 0.2
        """
        self.page = None  # Playwright page, assigned when a browser session is opened
        self.goods_links: set[str] = set()  # absolute product URLs collected so far
        self.parsing_result: list[dict] = []  # one dict per successfully scraped product
        self.scroller = open_scroller()  # value handed to page.evaluate() to scroll the page
        self.base_link = "https://www.ozon.ru"

    def _get_goods_links(self, number_of_goods: Union[Literal['max'], int] = 10) -> None:
        """Collect product links from the current search-result pages.

        Links are added to ``self.goods_links`` page by page, following the
        "next page" link, until either ``number_of_goods`` links are stored or
        no further page is available.

        Args:
            number_of_goods: ``'max'`` to collect every product found, or the
                number of links to stop at (default 10).

        version = 0.1.1
        """
        id_paginator_content = "#paginatorContent"
        tag_href = 'href'
        href_next_page = "Дальше"
        selector_next_page = f':text("{href_next_page}")'
        collect_all = number_of_goods == 'max'

        # Iterate instead of recursing once per page: a long result list can
        # no longer exhaust the interpreter's recursion limit.
        while True:
            self.page.wait_for_selector(id_paginator_content)
            all_products = self.page.query_selector(id_paginator_content)
            for tile in all_products.query_selector_all('.tile-hover-target'):
                if not collect_all and len(self.goods_links) >= number_of_goods:
                    break
                tile_href = tile.get_attribute(tag_href)
                if tile_href:  # skip tiles that carry no link
                    self.goods_links.add(self.base_link + tile_href)

            if not collect_all and len(self.goods_links) >= number_of_goods:
                return
            if not self.page.is_visible(selector_next_page):
                return

            # The "next page" anchor is looked up in the element that follows
            # the product container.  It is resolved only now, and every step
            # is checked, so the last page cannot fail on a missing
            # sibling/anchor.
            sibling = all_products.evaluate_handle(
                'element => element.nextElementSibling'
            ).as_element()
            anchor = sibling.query_selector('a') if sibling is not None else None
            next_href = anchor.get_attribute(tag_href) if anchor is not None else None
            if not next_href:
                return
            self.page.goto(self.base_link + next_href)

    @staticmethod
    def _digits_to_int(text: str) -> int:
        """Return the integer formed by the decimal digits of *text*.

        Every non-digit character (currency sign, regular or thin spaces,
        letters) is dropped.  Raises ``ValueError`` when *text* has no digits.
        """
        return int(re.sub(r'\D', '', text))

    @staticmethod
    def __parse_prices(prices) -> Tuple[Optional[int], int, Optional[int]]:
        """Parse the three kinds of price found in the price block.

        Args:
            prices: elements exposing ``inner_text()`` (the ``<span>`` elements
                of the price widget).

        Returns:
            ``(ozon_card_price, price, old_price)``; a component that is not
            present on the page is ``None``.

        Raises:
            ValueError: if the block is empty, has an unexpected number of
                spans, or a span expected to hold a price contains no digits.

        version = 0.1.1
        """
        texts = [price.inner_text() for price in prices]
        to_int = Ozon._digits_to_int

        if not texts:
            raise ValueError("price block contains no span elements")
        if len(texts) == 1:
            return None, to_int(texts[0]), None  # only the regular price
        if len(texts) == 2:
            return None, to_int(texts[0]), to_int(texts[1])  # no Ozon-card price
        if len(texts) < 4:
            raise ValueError(f"unexpected price block layout: {texts!r}")

        # Layout with an Ozon-card price: card price at index 1, regular price
        # at index 3 and, when present, the old price at index 4.
        ozon_card_price = to_int(texts[1])
        price = to_int(texts[3])
        old_text = texts[4] if len(texts) > 4 else ''
        old_price = to_int(old_text) if re.search(r'\d', old_text) else None
        return ozon_card_price, price, old_price

    @staticmethod
    def __parse_score_data(score_data: str) -> Tuple[Optional[float], Optional[int]]:
        """Parse the average score and the number of reviews.

        The text is expected to look like ``"<score> • <count> <word>"``
        (assumption based on the patterns used here — confirm against the
        live markup).

        Returns:
            ``(score, reviews)``.  ``(None, None)`` is returned both for the
            "no reviews" text and for any text without the `` •`` separator,
            so the caller can always unpack two values.

        version = 0.1.1
        """
        if score_data == 'Нет отзывов':
            return None, None  # neither an average score nor reviews
        parts = re.search(r'(.+) •(.*)', score_data)
        if parts is None:
            return None, None  # unrecognised format
        score = float(parts.group(1).strip().replace(',', '.'))
        # Drop every separator, whatever kind of space is used between thousands.
        review_digits = re.sub(r'\D', '', parts.group(2))
        reviews = int(review_digits) if review_digits else None
        return score, reviews

    def _get_good_descr(self, page_link: str, seller_timeout: float = 15.0) -> None:
        """Collect product data from the product page and store it.

        On success the product's ``dict()`` is appended to
        ``self.parsing_result``.  If an expected widget does not appear in time
        the product is skipped silently.

        Args:
            page_link: absolute URL of the product page.
            seller_timeout: maximum number of seconds spent scrolling while
                waiting for the seller widget to become visible.

        version = 0.2
        """
        href = 'href'
        reload_button = "#reload-button"
        seller_selector = 'div[data-widget="webCurrentSeller"]'
        title_selector = 'div[data-widget="webProductHeading"]'
        score_data_selector = 'div[data-widget="webSingleProductScore"]'
        self.page.goto(page_link)
        # give the anti-bot page time to load (relevant for the first link in the list)
        time.sleep(random.uniform(.5, 2))
        if self.page.is_visible(reload_button):
            self.page.click(reload_button)
        try:  # a timeout here means the page is hidden behind the age-restriction page
            self.page.wait_for_selector(title_selector, timeout=5_000)
            product = OzonProduct(page_link)
            product.title = self.page.query_selector(title_selector)
            product.article = self.page.query_selector('button[data-widget="webDetailSKU"]')
            product.category = self.page.query_selector('div[data-widget="breadCrumbs"]')
            prices = self.page.query_selector('div[data-widget="webPrice"]').query_selector_all('span')
            product.ozon_card_price, product.price, product.old_price = self.__parse_prices(prices)
            self.page.wait_for_selector(score_data_selector, timeout=5_000)
            score_data = self.page.query_selector(score_data_selector).inner_text()
            product.score, product.reviews = self.__parse_score_data(score_data)

            # Scroll until the seller widget shows up, but never forever: a
            # page without that widget used to hang the whole run.
            deadline = time.monotonic() + seller_timeout
            while not self.page.is_visible(seller_selector) and time.monotonic() < deadline:
                self.page.evaluate(self.scroller)
            # Raises the Playwright timeout (handled below) if the widget is still missing.
            seller_block = self.page.wait_for_selector(seller_selector, timeout=1_000)

            seller_data = seller_block.query_selector_all('a')
            product.seller = seller_data[1].inner_text()
            product.seller_href = seller_data[0].get_attribute(href)
            product.refund = seller_block.query_selector_all('li')[-1].inner_text()
            product.description = self.page.query_selector('div[data-widget="webDescription"]')
            self.parsing_result.append(product.dict())
        except TimeoutError:
            # Deliberate best effort: skip products whose page could not be read.
            return

    def find_all_goods(self, keyword: str, number_of_goods: Union[Literal['max'], int] = 10) -> None:
        """Search the site for *keyword* and collect links to the found products.

        Opens a headless browser, runs the search and, unless the "nothing
        found" message appears, stores product links in ``self.goods_links``.
        The browser is always closed before returning.

        Args:
            keyword: text typed into the search field.
            number_of_goods: ``'max'`` or the number of links to collect.

        version = 0.1.2
        """
        empty_selector = """
        Простите, по вашему запросу товаров сейчас нет.
        """
        reload_button = "#reload-button"
        with sync_playwright() as playwright:
            browser = playwright.chromium.launch(headless=True, args=LAUNCH_ARGS)
            try:
                context = browser.new_context(user_agent=UserAgent("chrome+firefox").Random())
                self.page = context.new_page()
                self.page.goto(self.base_link)  # open the site
                time.sleep(random.uniform(1, 3))
                # The reload button exists only on the anti-bot page; clicking it
                # unconditionally would wait for the default timeout and raise.
                if self.page.is_visible(reload_button):
                    self.page.click(reload_button)
                time.sleep(random.uniform(2, 3))
                # Playwright delays are in milliseconds (same scale as the click below).
                self.page.get_by_placeholder("Искать на Ozon").type(keyword, delay=random.uniform(100, 500))
                self.page.query_selector('button[aria-label="Поиск"]').click(delay=random.randint(100, 500))
                try:  # check whether the "nothing found" message is shown
                    self.page.wait_for_selector(f'text="{empty_selector}"', timeout=3_000)
                except TimeoutError:  # no such message: products were found
                    goods_found = True
                else:
                    goods_found = False
                if goods_found:
                    self._get_goods_links(number_of_goods)
            finally:
                browser.close()

    def describe_all_goods(self) -> Optional[list[dict]]:
        """Build the final dataset with the characteristics of all found products.

        Returns:
            ``self.parsing_result`` after visiting every link in
            ``self.goods_links``, or ``None`` when no links were collected.

        version = 0.1
        """
        if not self.goods_links:  # no product links were found
            return None
        with sync_playwright() as playwright:
            browser = playwright.chromium.launch(headless=True, args=LAUNCH_ARGS)
            try:
                context = browser.new_context(user_agent=UserAgent("chrome+firefox").Random())
                self.page = context.new_page()
                for link in tqdm(self.goods_links, ascii=True):  # scrape every product
                    time.sleep(random.random())
                    self._get_good_descr(link)
            finally:
                browser.close()  # also runs when scraping fails half-way
        return self.parsing_result
0 commit comments