#!/usr/bin/env python3
"""Scrape the store configured under the ``action`` config section.

The script reads the category index (XML), follows every subcategory link
found on each category page and hands every product of every subcategory to
``common.add_product`` together with the Kafka producer.
"""
import json
import logging
import sqlite3
import sys
import traceback
import xml.etree.ElementTree as ET
from datetime import datetime

import requests
import unidecode
from bs4 import BeautifulSoup
from kafka import KafkaProducer

import common

logger = logging.getLogger(__name__)

# Seconds to wait for the server to connect/send data before giving up.
# Without a timeout ``requests`` waits forever on a stalled connection.
REQUEST_TIMEOUT = 60

# Seconds to wait for buffered Kafka messages to be delivered on shutdown.
KAFKA_FLUSH_TIMEOUT = 60


def _transliterate(value):
    """Return *value* with every string inside it converted to ASCII.

    Walks nested dicts and lists as produced by ``json.loads``; values of any
    other type are returned unchanged.
    """
    if isinstance(value, str):
        return unidecode.unidecode(value)
    if isinstance(value, list):
        return [_transliterate(item) for item in value]
    if isinstance(value, dict):
        return {
            _transliterate(key): _transliterate(item)
            for key, item in value.items()
        }
    return value


def _fetch_json(url, cookies):
    """GET *url* and return its JSON payload with ASCII-only strings.

    Raises:
        requests.RequestException: on timeout, connection failure or an HTTP
            error status.
        json.JSONDecodeError: if the body is not valid JSON.
    """
    response = requests.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    text = response.content.decode('utf-8', 'ignore')
    # Parse first, transliterate afterwards: unidecode turns typographic
    # quotes into plain '"', which would corrupt the raw JSON text.
    return _transliterate(json.loads(text))


def get_categories(url):
    """Return a ``{label: url}`` dict read from the XML category index.

    Every ``./Data/CategoryNavigationViewModel`` element contributes the text
    of its ``Label`` and ``Url`` children. Elements missing either child (or
    with an empty one) are skipped and logged.

    Raises:
        requests.RequestException: on timeout, connection failure or an HTTP
            error status.
        xml.etree.ElementTree.ParseError: if the body is not well-formed XML.
    """
    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    root = ET.fromstring(page.content)

    categories = {}
    for cat_elm in root.findall("./Data/CategoryNavigationViewModel"):
        label = cat_elm.findtext("Label")
        cat_url = cat_elm.findtext("Url")
        if not label or not cat_url:
            logger.warning("Skipping category without Label/Url in %s", url)
            continue
        categories[label] = cat_url
    return categories


def get_products(config, cat_url, kafka_producer):
    """Publish every product of one (sub)category.

    The ``products_url`` template from ``config['action']`` has its
    ``__url__``, ``__offset__`` and ``__limit__`` placeholders filled in. A
    first request with limit 1 reads ``totalCount``; a second request then
    asks for that many items at once. Items whose ``type`` is not
    ``'product'`` are ignored. A product that cannot be converted or
    published is recorded through ``common.dump_failed_product`` and does not
    stop the remaining products.

    Args:
        config: Parsed configuration; only the ``action`` section is read.
        cat_url: Category path substituted for ``__url__``.
        kafka_producer: Producer passed through to ``common.add_product``.

    Raises:
        requests.RequestException: if either listing request fails.
        json.JSONDecodeError: if a listing response is not valid JSON.
    """
    store = config['action']
    url = store['products_url']
    url = url.replace("__url__", cat_url)
    url = url.replace("__offset__", "0")
    products_cookie = store['products_cookie']

    total_count = _fetch_json(
        url.replace("__limit__", "1"), products_cookie)['totalCount']
    if not total_count:
        # Nothing listed; avoid a second request with a limit of 0.
        return

    products_json = _fetch_json(
        url.replace("__limit__", str(total_count)), products_cookie)

    for product_json in products_json['items']:
        try:
            if product_json['type'] != 'product':
                continue

            product = {
                'title': product_json['title'],
                'url': store['url'] + product_json['url'],
                'image_url': store['url'] + product_json['imageUrl'],
                'info': product_json['subTitle'],
                'sku_code': product_json['code'],
            }
            for spec in product_json['specifications']:
                if spec['id'] == 'attEANCodeVariant':
                    product['ean_code'] = spec['value']
                if spec['id'] == 'attLongDescription':
                    product['description'] = spec['value']
            product['brand'] = product_json['brandName']

            price = {
                'price': product_json['price'],
                # Any falsy isDeal (False, null) means "no promotion".
                'promo': 1 if product_json['isDeal'] else 0,
                'promo_start': product_json['dealStartDate'],
                'promo_end': product_json['dealEndDate'],
            }
            common.add_product(kafka_producer, store['name'], product, price)
        except Exception as err:
            # Deliberately broad: one malformed product must not abort the
            # category; the failure is recorded with its traceback instead.
            common.dump_failed_product(
                store['name'], cat_url, product_json, err,
                traceback.format_exc())


def main():
    """Run one full scrape of the configured store."""
    config = common.get_config()
    store = config['action']
    producer = KafkaProducer(
        bootstrap_servers=[config['config']['kafka_boostrap_servers']])

    common.insert_update_store(
        producer,
        {'store': store['name'], 'url': store['url'],
         'image_url': store['logo']})
    common.clear_failed_product(store['name'])

    categories = get_categories(store['categories_url'])
    for cat_path in categories.values():
        soup = common.get_soup_page_no_limit(f"{store['url']}{cat_path}")
        for link in soup.find_all("a", class_="subcategory-cta-list__cta"):
            href = link.get('href')
            if not href:
                # An anchor without a target cannot be scraped.
                continue
            get_products(config, href, producer)

    # Runs once, after every category has been published.
    common.update_store_prices(producer, store)

    # Make sure buffered messages are delivered before the process exits.
    producer.flush(timeout=KAFKA_FLUSH_TIMEOUT)


if __name__ == "__main__":
    main()