Skip to content

Commit 60ee154

Browse files
committed
Merge branch 'ozon'
2 parents 1849ad8 + ee842f3 commit 60ee154

File tree

6 files changed

+302
-0
lines changed

6 files changed

+302
-0
lines changed

helpers/parsers_helpers.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
11
def open_scroller():
    """Return the source of the JS function that simulates page scrolling.

    The script is read from ``scrollFunc.js``. It is looked up next to this
    module first, so the call works from any working directory; if it is not
    found there, the historical path ``../helpers/scrollFunc.js`` (relative
    to the current working directory) is used as a fallback.

    version = 0.1

    Returns:
        str: the contents of ``scrollFunc.js`` decoded as UTF-8.

    Raises:
        OSError: if the script cannot be opened at either location.
    """
    # Function-scope import so the module's import block stays untouched.
    from pathlib import Path

    script_path = Path(__file__).resolve().parent / "scrollFunc.js"
    if not script_path.is_file():
        # Backward-compatible fallback: the original cwd-relative location.
        script_path = Path("../helpers/scrollFunc.js")
    # Explicit encoding: the platform default codec is not UTF-8 everywhere.
    return script_path.read_text(encoding="utf-8")
7+
8+
9+
# Extra command-line arguments handed to `playwright.chromium.launch(args=...)`
# by the parsers and their tests; the single flag switches off Blink's
# `AutomationControlled` feature.
LAUNCH_ARGS = ['--disable-blink-features=AutomationControlled']

parsers/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
from parsers.wildberries_parser import Wildberries
2+
from parsers.ozon_parser import Ozon

parsers/ozon_parser.py

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
import re
2+
import time
3+
import random
4+
from typing import Union, Literal, Optional, Tuple
5+
from playwright.sync_api import sync_playwright
6+
from playwright._impl._errors import TimeoutError
7+
from getuseragent import UserAgent
8+
from helpers.parsers_helpers import *
9+
from tqdm import tqdm
10+
11+
from parsers_dataclasses import OzonProduct
12+
13+
14+
class Ozon:
    """Playwright-based scraper for ozon.ru.

    Usage: call :meth:`find_all_goods` to collect product links for a search
    keyword into ``goods_links``, then :meth:`describe_all_goods` to visit
    every collected link and build ``parsing_result``.
    """

    __version__ = "0.2.2"

    def __init__(self):
        """Create a scraper with empty results; no browser is started here.

        version = 0.2
        """
        self.page = None  # active Playwright page, assigned by the public methods
        self.goods_links: set[str] = set()  # absolute product URLs collected so far
        self.parsing_result: list[dict] = []  # one dict per successfully parsed product
        self.scroller = open_scroller()  # JS source evaluated on the page to scroll it
        self.base_link = "https://www.ozon.ru"

    def _get_goods_links(self, number_of_goods: Union[Literal['max'], int] = 10) -> None:
        """Collect product links from the search results open in ``self.page``.

        Either every product on every results page is collected
        (``number_of_goods='max'``) or collection stops as soon as
        ``goods_links`` holds ``number_of_goods`` links (default 10).
        Result pages are walked iteratively, so the number of pages is not
        limited by the interpreter recursion limit.

        version = 0.1.1

        Args:
            number_of_goods: ``'max'`` or the wanted number of links.
        """
        id_paginator_content = "#paginatorContent"
        tag_href = 'href'
        selector_next_page = ':text("Дальше")'
        collect_all = number_of_goods == 'max'

        while True:
            self.page.wait_for_selector(id_paginator_content)
            all_products = self.page.query_selector(id_paginator_content)

            for link in all_products.query_selector_all('.tile-hover-target'):
                if not collect_all and len(self.goods_links) >= number_of_goods:
                    break
                link_href = link.get_attribute(tag_href)
                if link_href:  # a tile without href cannot be turned into a URL
                    self.goods_links.add(self.base_link + link_href)

            if not collect_all and len(self.goods_links) >= number_of_goods:
                return
            if not self.page.is_visible(selector_next_page):
                return  # last results page reached

            # The next-page anchor is read from the element that follows the
            # paginator content. It is resolved only when navigation is
            # really needed, and a missing sibling/anchor ends the walk
            # instead of raising AttributeError.
            sibling = all_products.evaluate_handle('element => element.nextElementSibling').as_element()
            next_anchor = sibling.query_selector('a') if sibling is not None else None
            next_href = next_anchor.get_attribute(tag_href) if next_anchor is not None else None
            if not next_href:
                return
            self.page.goto(self.base_link + next_href)

    @staticmethod
    def __parse_prices(prices) -> Tuple[Optional[int], int, Optional[int]]:
        """Parse the three kinds of price from the spans of the price widget.

        version = 0.1.1

        Args:
            prices: element handles (objects exposing ``inner_text()``) of
                the ``span`` elements found inside the price widget.

        Returns:
            ``(ozon_card_price, price, old_price)``; the first and the last
            item are ``None`` when the widget does not show them.

        Raises:
            IndexError: for an empty list, or for 3-4 spans (the layout with
                the Ozon card price reads spans 1, 3 and 4).
            ValueError: if a span that must hold a price contains no digits.
        """
        def to_int(text: str) -> int:
            # Dropping every non-digit removes the currency sign and any
            # thousands separator (regular, thin or non-breaking space).
            return int(re.sub(r'\D', '', text))

        texts = [span.inner_text() for span in prices]
        if len(texts) > 2:
            card_text, price_text, old_text = texts[1], texts[3], texts[4]
            ozon_card_price, price = to_int(card_text), to_int(price_text)
            if re.search(r'\d', old_text):
                return ozon_card_price, price, to_int(old_text)  # all possible prices
            return ozon_card_price, price, None  # everything except the old price
        if len(texts) == 1:
            return None, to_int(texts[0]), None  # only the price without the card
        return None, to_int(texts[0]), to_int(texts[1])  # everything except the card price

    @staticmethod
    def __parse_score_data(score_data: str) -> Tuple[Optional[float], Optional[int]]:
        """Parse the average score and the number of reviews.

        version = 0.1.1

        Args:
            score_data: text of the score widget; the parsed form is
                ``"<score> • <reviews count> <word>"``.

        Returns:
            ``(score, reviews)``. ``(None, None)`` is returned both for the
            literal ``'Нет отзывов'`` and for any text without the
            ``"<score> •"`` part, so the caller can always unpack a pair.
            ``reviews`` is ``None`` when no number follows the bullet.

        Raises:
            ValueError: if the text before the bullet is not a float.
        """
        if score_data == 'Нет отзывов':
            return None, None  # neither an average score nor reviews
        score_match = re.search(r'.+(?= •)', score_data)
        if not score_match:
            return None, None
        score = float(score_match.group(0))
        # Digits after the bullet may be grouped with any whitespace
        # character, which int() would reject if left in place.
        reviews_match = re.search(r'•\s*(\d[\d\s]*)', score_data)
        reviews = int(re.sub(r'\s', '', reviews_match.group(1))) if reviews_match else None
        return score, reviews

    def _get_good_descr(self, page_link: str) -> None:
        """Collect the data of one product from its page.

        On success the product dictionary is appended to
        ``self.parsing_result``. If the title or the score widget does not
        appear within 5 seconds the product is skipped silently.

        version = 0.2

        Args:
            page_link: absolute URL of the product page.
        """
        href = 'href'
        reload_button = "#reload-button"
        seller_selector = 'div[data-widget="webCurrentSeller"]'
        title_selector = 'div[data-widget="webProductHeading"]'
        score_data_selector = 'div[data-widget="webSingleProductScore"]'
        self.page.goto(page_link)
        # give the anti-bot page time to load (happens for the first link in the list)
        time.sleep(random.uniform(.5, 2))
        if self.page.is_visible(reload_button):
            self.page.click(reload_button)
        try:  # makes sure the page is not hidden behind the age-restriction page
            self.page.wait_for_selector(title_selector, timeout=5_000)
            product = OzonProduct(page_link)
            # title/article/category/description receive raw element handles;
            # OzonProduct converts them to plain values on assignment.
            product.title = self.page.query_selector(title_selector)
            product.article = self.page.query_selector('button[data-widget="webDetailSKU"]')
            product.category = self.page.query_selector('div[data-widget="breadCrumbs"]')
            prices = self.page.query_selector('div[data-widget="webPrice"]').query_selector_all('span')
            product.ozon_card_price, product.price, product.old_price = self.__parse_prices(prices)
            self.page.wait_for_selector(score_data_selector, timeout=5_000)
            score_data = self.page.query_selector(score_data_selector).inner_text()
            product.score, product.reviews = self.__parse_score_data(score_data)
            # NOTE(review): this loop has no upper bound; it never ends if the
            # seller widget does not become visible — confirm that is acceptable.
            while not self.page.is_visible(seller_selector):
                self.page.evaluate(self.scroller)
            seller_block = self.page.query_selector(seller_selector)
            seller_data = seller_block.query_selector_all('a')
            product.seller = seller_data[1].inner_text()
            product.seller_href = seller_data[0].get_attribute(href)
            product.refund = seller_block.query_selector_all('li')[-1].inner_text()
            product.description = self.page.query_selector('div[data-widget="webDescription"]')
            self.parsing_result.append(product.dict())
        except TimeoutError:
            # Deliberate best effort: a page whose widgets never show up is skipped.
            return

    def find_all_goods(self, keyword: str, number_of_goods: Union[Literal['max'], int] = 10) -> None:
        """Search the site for ``keyword`` and collect links to the products found.

        The links are stored in ``self.goods_links``; nothing is collected
        when the "no products" message is shown for the query.

        version = 0.1.2

        Args:
            keyword: text typed into the site search box.
            number_of_goods: ``'max'`` or the wanted number of links.
        """
        # Same characters as the original multi-line literal as visible in
        # the source: newline, message, newline.
        empty_selector = "\nПростите, по вашему запросу товаров сейчас нет.\n"
        with sync_playwright() as playwright:
            browser = playwright.chromium.launch(headless=True, args=LAUNCH_ARGS)
            try:  # the browser is closed even if any step below fails
                context = browser.new_context(user_agent=UserAgent("chrome+firefox").Random())
                self.page = context.new_page()
                self.page.goto(self.base_link)  # open the site
                time.sleep(random.uniform(1, 3))
                self.page.click("#reload-button")
                time.sleep(random.uniform(2, 3))
                # NOTE(review): Playwright measures `delay` in milliseconds, so
                # this is 0.1-0.5 ms per key while the click below uses
                # 100-500 ms — confirm the intended unit.
                self.page.get_by_placeholder("Искать на Ozon").type(keyword, delay=random.uniform(.1, .5))
                self.page.query_selector('button[aria-label="Поиск"]').click(delay=random.randint(100, 500))
                try:  # checks whether the query returned nothing
                    self.page.wait_for_selector(f'text="{empty_selector}"', timeout=3_000)
                except TimeoutError:  # products were found for the query
                    self._get_goods_links(number_of_goods)
            finally:
                browser.close()

    def describe_all_goods(self) -> Optional[list[dict]]:
        """Build the final dataset with the data of every collected product.

        version = 0.1

        Returns:
            ``self.parsing_result`` after visiting every link from
            ``self.goods_links``, or ``None`` when no links were collected.
        """
        if not self.goods_links:  # no product links were found
            return None
        with sync_playwright() as playwright:
            browser = playwright.chromium.launch(headless=True, args=LAUNCH_ARGS)
            try:  # the browser is closed even if parsing a product raises
                context = browser.new_context(user_agent=UserAgent("chrome+firefox").Random())
                self.page = context.new_page()
                for link in tqdm(self.goods_links, ascii=True):  # collect data of every product
                    time.sleep(random.random())
                    self._get_good_descr(link)
            finally:
                browser.close()
        return self.parsing_result

parsers_dataclasses/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
from parsers_dataclasses.wildberries_dataclasses import WildberriesProduct
2+
from parsers_dataclasses.ozon_dataclasses import OzonProduct
parsers_dataclasses/ozon_dataclasses.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import re
2+
from typing import Optional, Union
3+
from dataclasses import dataclass, field, asdict
4+
from playwright.sync_api._generated import ElementHandle
5+
6+
7+
@dataclass
class OzonProduct:
    """Data scraped from a single Ozon product page.

    Only ``page_link`` is a constructor argument; every other field is
    assigned afterwards. ``title``, ``article``, ``category`` and
    ``description`` may be assigned a raw element handle, which
    ``__setattr__`` converts to a plain value.
    """

    __version__ = "0.1.1"
    __base_link = "https://www.ozon.ru"

    page_link: str = field(init=True)  # link to the product page
    title: str = field(init=False)  # product title
    article: int = field(init=False)  # product article (SKU) number
    category: str = field(init=False)  # product category path, parts joined with '/'
    ozon_card_price: Optional[int] = field(init=False)  # price with the Ozon card
    price: int = field(init=False)  # price without the Ozon card
    old_price: Optional[int] = field(init=False)  # price before the discount
    score: Optional[float] = field(init=False)  # average product score
    reviews: Optional[int] = field(init=False)  # number of product reviews
    seller: str = field(init=False)  # product seller
    seller_href: str = field(init=False)  # link to the seller's other products
    refund: str = field(init=False)  # refund availability text
    description: Optional[str] = field(init=False)  # description text, None when absent

    def dict(self):
        """Return the dataclass fields as a plain dictionary.

        version = 0.1
        """
        return asdict(self)

    def __setattr__(self, key: str, value: "Optional[Union[int, float, str, ElementHandle]]"):
        """Assign ``value`` to ``key``, converting element handles to plain data.

        ``None`` and plain ``int``/``float``/``str`` values are stored as
        they are. Any other value assigned to one of the four converted
        fields is treated as an element handle:

        * ``title`` - text of the nested ``h1``;
        * ``article`` - digits of the nested ``div`` text, as ``int``;
        * ``category`` - element text with newlines replaced by ``/``;
        * ``description`` - element text on one line without the leading
          ``'Описание '`` and trailing ``' Показать полностью'``; ``None``
          when only ``'Показать полностью'`` is left.

        version = 0.1
        """
        # Only raw handles need conversion; without this guard a missing
        # widget (None) or an already parsed value raised AttributeError.
        if value is not None and not isinstance(value, (int, float, str)):
            if key == 'title':
                value = value.query_selector('h1').inner_text()
            elif key == 'article':
                value = int(re.sub(r'[^\d]', '', value.query_selector('div').inner_text()))
            elif key == 'category':
                value = value.inner_text().replace('\n', '/')
            elif key == 'description':
                text = value.inner_text().replace('\n', ' ')
                text = re.sub(r'^Описание ', '', text)
                text = re.sub(r' Показать полностью$', '', text)
                value = None if text == 'Показать полностью' else text
        super().__setattr__(key, value)

test/test_ozon.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
import time
2+
import pprint
3+
import unittest
4+
from parsers import Ozon
5+
from getuseragent import UserAgent
6+
from playwright.sync_api import sync_playwright
7+
from helpers.parsers_helpers import LAUNCH_ARGS
8+
9+
10+
class TestOzon(unittest.TestCase):
    """Integration tests for the `Ozon` parser.

    Every test drives a real headless Chromium through Playwright against
    https://www.ozon.ru, so results depend on the network and on the current
    state of the site.
    """

    def setUp(self):
        """Create a fresh parser and the search keywords used by the tests."""
        self.ozon = Ozon()
        self.keyword = "гантели разборные"
        self.error_keyword = "blahblahblah"  # query expected to give no products

    def __test_product(self, link):
        """Parse a single product page and pretty-print the parsing result.

        Makes no assertions: the check passes when `_get_good_descr` does not
        raise.
        """
        pp = pprint.PrettyPrinter(indent=4)
        with sync_playwright() as playwright:
            browser = playwright.chromium.launch(headless=True, args=LAUNCH_ARGS)
            context = browser.new_context(user_agent=UserAgent("chrome+firefox").Random())
            # the parser normally opens its own page; here it is injected by hand
            self.ozon.page = context.new_page()
            self.ozon._get_good_descr(link)
            pp.pprint(self.ozon.parsing_result)
            browser.close()

    # Default number_of_goods: exactly 10 links must be collected.
    def test_not_implicit_goods(self):
        time.sleep(1)
        self.ozon.find_all_goods(self.keyword)
        self.assertEqual(10, len(self.ozon.goods_links))

    # Explicit number_of_goods=60: exactly 60 links must be collected.
    def test_not_explicit_goods(self):
        time.sleep(1)
        self.ozon.find_all_goods(self.keyword, 60)
        self.assertEqual(60, len(self.ozon.goods_links))

    # A query without results must leave goods_links empty.
    def test_empty_page(self):
        time.sleep(1)
        self.ozon.find_all_goods(self.error_keyword)
        self.assertEqual(0, len(self.ozon.goods_links))

    # Full pipeline: collect 50 links, describe them, expect a non-empty result.
    def test_many_products(self):
        time.sleep(1)
        self.ozon.find_all_goods(self.keyword, number_of_goods=50)
        pprint.PrettyPrinter(indent=4).pprint(self.ozon.describe_all_goods())
        self.assertNotEqual(0, len(self.ozon.parsing_result))

    # The four tests below parse one hard-coded product page each and only
    # print the result (see __test_product).
    def test_one_product(self):
        time.sleep(1)
        self.__test_product('https://www.ozon.ru/product/ganteli-razbornye-nabor-2-ganteli-po-20-kg-shtanga'
                            '-tsement-plastik-metal-obshchiy-ves-40kg-259855059/?__rr=1&advert'
                            '=502zKeKFvgJ7CKGWutSLxYaIXVbUhP1b7fbj5Mq-mqUN'
                            '-jsnmkWXUPoFPL7osUC3zE7rvGb9zKgRHyUiv58TMuCPfO9wgnhVo11OUL7ulJRFDThe9M2WzJQhDov3Fua'
                            '49GtwO6I-7xVJqx8lLe2IrEBQ8iuMw4KFQNxoejkIMLFb9fCLx0joZOTnvtsVVJKzMUkQ6gsZANHVByhWb-'
                            'n6WhSYKgPZjWOtZ_ykPblSK0mgBfVYbf23RbV0dR6rcWXtFXcTgUV68Q2coMd9ybF1ExhhiMmfNhGK6S3l1'
                            'f1wpeS1bVwfySYNcYSve1YoN2E5RJgn2vtJKy55z1WPd0YW8esEzhL8KX8GqCXrwlPuw42PGNaATcBBwf6T'
                            'Ne05SpWg&avtc=1&avte=2&avts=1722161757&keywords=гантели+разборные')

    def test_second_product(self):
        time.sleep(1)
        self.__test_product('https://www.ozon.ru/product/polotentse-dlya-litsa-ruk-lavsan-28x38-sm-raznotsvetnyy-3-sht'
                            '-1640509000/?advert=d5Yq1oat619zv0QWYL_BItEmMqvxJ-zPgjFcPJANGCCqp-'
                            'U573dxV97K59h7QuEMKeuiSwqrkqz89U85-l-u2WINf-3dk73iBUBF3yexrsqeR0EfxpUIuUetNcz-'
                            'dDw3l1utJBVKm4f65Lia-zOMjyHzB2aC09asAINM4jdRVrG0y_KYNQoMwi1Oyt_r_XJj8kwL2lHireJv_'
                            'qFGY6KadRuf77DSwc6nYPadP2HBCuSSCRv8TspHJIK8uD4Ia_'
                            'goIBb2XahOV0cfeTJZufYWD4lUdV1jZYox0cdZWgS-'
                            'rBawGBqIfraDvZHHYPCSyUPSp5vdfkhnS0VwCnia4sGm8uJf6Zbb4Q6UmXZvm7NFlc5fWEQ&avtc=1&avte='
                            '2&avts=1722247095&keywords=%D0%BF%D0%BE%D0%BB%D0%BE%D1%82%D0%B5%D0%BD%D1%86%D0%B0+'
                            '%D0%BA%D1%83%D1%85%D0%BE%D0%BD%D0%BD%D1%8B%D0%B5')

    def test_third_product(self):
        time.sleep(1)
        self.__test_product('https://www.ozon.ru/product/nosovye-platochki-semya-i-komfort-siren-3-sloya-10sht-h3sht-'
                            '1582316998/?asb=JRrLrfQ439zOnGrAbm1QthyYJuEtPs634mnNsJFanP0%253D&asb2=mJsQREIkQwVwMeMk_'
                            'PPzyp8011kE1aUiUPa9inwldBj4K3oc3sDamJ1Q2skC5uS7TPCxD35PJt9rKzhoXz4gIQ&avtc=1&avte=2&avts='
                            '1722249508&keywords=%D0%BE%D0%B4%D0%BD%D0%BE%D1%80%D0%B0%D0%B7%D0%BE%D0%B2%D1%8B%D0%B5+'
                            '%D0%BF%D0%BB%D0%B0%D1%82%D0%BA%D0%B8')

    def test_fourth_product(self):
        time.sleep(1)
        self.__test_product('https://www.ozon.ru/product/floom-bumazhnye-platki-10-sht-1259633693/?asb=%252Bdrc97kz%'
                            '252FCAs6D%252FyyagqoMGYUAQWWjC02N9NT%252Fe4jF4%253D&asb2=S4EEm5UMHgKepW-'
                            'Xo1iva0kdUnG3x99F6Wypb7tU3q0v_tKM0XoQ0WZE9H1Al_iHprtpK5l73PjmHn_yJOBKfw&avtc=2&avte='
                            '1&avts=1722251068&keywords=%D0%BE%D0%B4%D0%BD%D0%BE%D1%80%D0%B0%D0%B7%D0%BE%D0%B2%D1%'
                            '8B%D0%B5+%D0%BF%D0%BB%D0%B0%D1%82%D0%BA%D0%B8')
86+
87+
88+
# Run the test suite when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)