#!/usr/bin/env python3
"""Scrape product data from the Hubo webshop and publish it to Kafka.

Walks the paginated product-listing JSON endpoint, fetches each product
detail page on a worker thread, extracts details (title, EAN barcode,
price, promo flag, ...) and forwards them via ``common.add_product``.
Failed pages are recorded with ``common.dump_failed_product`` instead of
aborting the run.
"""
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import re
import traceback
import sys
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer

# Matches an EAN-8 up to EAN-13 barcode; compiled once instead of per row.
EAN_PATTERN = re.compile(r"([0-9]{8,13})")


def get_product_details(prod_url, config, kafka_producer, json_product):
    """Fetch one product page, extract its details and publish them.

    Runs on a worker thread; concurrency is bounded by the module-level
    semaphore ``sema``.

    :param prod_url: absolute URL of the product detail page
    :param config: parsed configuration dict (the ``hubo`` section is used)
    :param kafka_producer: producer handed through to ``common.add_product``
    :param json_product: product record from the listing endpoint
                         (keys: title, sku, price, optionally image/brand/discount)

    Any exception is caught and routed to ``common.dump_failed_product``
    so a single bad page never kills the whole scrape.
    """
    # BUGFIX: `soup` must exist before the try block — if the page fetch
    # itself raises, the except handler references `soup` and previously
    # died with a NameError that masked the real failure.
    soup = None
    # BUGFIX: the semaphore was created but never acquired, so the number
    # of concurrently running fetches was effectively unbounded.
    with sema:
        try:
            soup = common.get_soup_page_no_limit(prod_url)
            description = soup.find('div', class_='pdp-description__content')
            ean_row = soup.find_all('div', class_='row border-bottom')
            product = {}
            price = {}
            product['title'] = json_product['title']
            product['url'] = prod_url
            if 'image' in json_product:
                product['image_url'] = json_product['image']
            # NOTE(review): `description` may be None if the page layout
            # changes; the broad except below then records it as a failure.
            if description.find("p") is not None:
                product['info'] = description.find("p").text
            product['sku_code'] = json_product['sku']
            for row in ean_row:
                if 'EAN- / barcode' in row.text:
                    ean_match = EAN_PATTERN.search(row.text)
                    product['ean_code'] = ean_match.group(1)
            if 'brand' in json_product:
                product['brand'] = json_product['brand']
            price['price'] = json_product['price']
            # Presence of a 'discount' key marks a promotion.
            price['promo'] = 1 if 'discount' in json_product else 0
            common.add_product(kafka_producer, config['hubo']['name'], product, price)
        except Exception as err:
            common.dump_failed_product(config['hubo']['name'], prod_url, soup, err, traceback.format_exc())


def next_url(url_list):
    """Pop and return the first entry of *url_list* (a mapping), or None when empty.

    NOTE(review): currently unused by this script — kept for compatibility.
    """
    if len(url_list) == 0:
        return None
    key = next(iter(url_list))
    url = url_list[key]
    del url_list[key]
    return url


# Bound on concurrently running scraper threads.
maxthreads = 5
sema = threading.Semaphore(value=maxthreads)
threads = list()

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['hubo']['name'], 'url': config['hubo']['url'], 'image_url': config['hubo']['logo']})
    common.clear_failed_product(config['hubo']['name'])
    # First listing page; subsequent pages use the __OFFSET__ template URL.
    get_doc = requests.get(config['hubo']['products_url'])
    data_json = json.loads(unidecode.unidecode(get_doc.content.decode('utf-8', 'ignore')))
    offset = 1
    while len(data_json['docs']) > 0:
        print(f'{offset}/{data_json["doc_count"]}')
        for product in data_json['docs']:
            # One worker per product; actual parallelism is capped by `sema`
            # inside get_product_details.
            thread = threading.Thread(target=get_product_details, args=(f'{config["hubo"]["url"]}{product["url"]}', config, producer, product))
            threads.append(thread)
            thread.start()
        offset = offset + int(data_json['limit'])
        get_doc = requests.get(config['hubo']['products_url_offset'].replace('__OFFSET__', str(offset)))
        data_json = json.loads(unidecode.unidecode(get_doc.content.decode('utf-8', 'ignore')))
    # Wait for every worker before computing aggregate store prices.
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['hubo'])