#!/usr/bin/env python3
"""Scraper for the "fun" store.

Walks the store's category tree breadth-first, spawns one worker thread per
product page (bounded by a semaphore) and publishes product/price records to
Kafka through the shared `common` helpers.
"""
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import traceback
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer


def get_categories(cat_url, config=None):
    """Fetch the category menu and return ``{category name: relative URL}``.

    The endpoint returns an escaped HTML fragment wrapped in quotes, so the
    payload is unescaped by hand before being parsed as XML.

    :param cat_url: URL of the category-menu endpoint.
    :param config: parsed configuration; when None (backward compatible with
        the original call signature) it is loaded via ``common.get_config()``.
    :return: dict mapping category label to store-relative URL.
    """
    if config is None:
        config = common.get_config()
    page = requests.get(cat_url)
    # Strip the surrounding quote bytes and undo the JSON-style escaping.
    content = page.content[1:-1].decode('utf-8')
    content = content.replace("\\r\\n", "")
    content = content.replace('\\/', '/')
    content = content.replace('\\"', '"')
    # A bare ' & ' is not well-formed XML; collapse it so ET.fromstring works.
    content = content.replace(' & ', '&')
    root = ET.fromstring(content)
    categories = {}
    for elm in root.iter():
        if elm.tag != "a":
            continue
        if elm.attrib['href'] == '#' or elm.text is None:
            continue
        # Skip gift-card ("CADEAU") and catch-all ("VOLLEDIG") menu entries.
        label_upper = elm.text.upper()
        if "CADEAU" in label_upper or "VOLLEDIG" in label_upper:
            continue
        # Compare the *stripped* URL against stored values; the original
        # compared the absolute href, so deduplication never triggered.
        rel_url = elm.attrib['href'].replace(config["fun"]["url"], "")
        if rel_url not in categories.values():
            categories[elm.text] = rel_url
    return categories


def next_url(url_list):
    """Pop and return the first URL from *url_list* (mutated in place).

    :param url_list: dict of ``{name: url}`` acting as a work queue.
    :return: the first URL in insertion order, or None when the dict is empty.
    """
    if not url_list:
        return None
    key = next(iter(url_list))
    return url_list.pop(key)


def get_product_details(prod_url, config, kafka_producer):
    """Scrape one product page and publish product + price records to Kafka.

    Runs as a worker-thread target; ``sema`` caps concurrent scrapes at
    ``maxthreads``. Any parsing failure is recorded via
    ``common.dump_failed_product`` instead of propagating out of the thread.
    """
    with sema:  # bound concurrent product scrapes to `maxthreads`
        soup = common.get_soup_page_no_limit(prod_url)
        if soup is None:
            return
        try:
            prod_view = soup.find("div", class_="product-view")
            prod_essential = prod_view.find("div", class_="product-essential")
            image = prod_view.find("img", {'itemprop': 'image'})['src']
            price = prod_essential.find("meta", {'itemprop': 'price'})['content']
            special_price = prod_essential.find("p", class_="special-price")
            promo = False
            promo_end = None
            if special_price is not None:
                promo = True
                promo_end = prod_essential.find("meta", {'itemprop': "priceValidUntil"})
                if promo_end is not None:
                    promo_end = promo_end['content']
            title = prod_view.find("h1", {'itemprop': 'name'}).text
            sku = prod_view.find("meta", {'itemprop': 'sku'})['content']
            brand = prod_view.find("meta", {'itemprop': 'brand'})['content']
            description = prod_view.find("meta", {'itemprop': 'description'})['content']
            specs = prod_view.find("div", class_="tab-content tab-block-additional")
            # Build the free-text spec summary; SKU and EAN are reported in
            # dedicated fields, so keep them out of `info`.
            info_parts = []
            for spec in specs.find_all("li"):
                label = spec.find("span", class_="label").text
                content = spec.find("span", class_="data").text
                if label.upper().find("SKU") == -1 and label.upper().find("EAN") == -1:
                    info_parts.append(f'{label}: {content}')
            info = ' / '.join(info_parts)
            ean_code = ''
            for elm in prod_view.find_all("li"):
                if elm is not None and elm.text.upper().find("EAN") >= 0:
                    data_span = elm.find("span", class_="data")
                    # Guard: some "EAN" rows have no data span; the original
                    # raised AttributeError and dumped the product as failed.
                    if data_span is not None:
                        ean_code = data_span.text
            product_details = {'title': title,
                               'url': prod_url,
                               'sku_code': sku,
                               'brand': brand,
                               'description': description,
                               'image_url': image,
                               'ean_code': ean_code,
                               'info': info}
            price_details = {'price': price,
                             "promo": promo,
                             "promo_end": promo_end}
            common.add_product(kafka_producer, config['fun']['name'],
                               product_details, price_details)
        except Exception as err:
            # Best effort: record the page for later inspection, keep crawling.
            common.dump_failed_product(config['fun']['name'], prod_url, soup,
                                       err, traceback.format_exc())


# Cap on concurrently running product-scrape threads.
maxthreads = 10
sema = threading.Semaphore(value=maxthreads)
threads = list()

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(
        bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['fun']['name'],
                                          'url': config['fun']['url'],
                                          'image_url': config['fun']['logo']})
    common.clear_failed_product(config['fun']['name'])

    # `process_categories` is the work queue; `all_categories` remembers every
    # URL ever seen so newly discovered sub-categories are enqueued only once.
    all_categories = get_categories(config['fun']['categories_url'], config)
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        cat_url = config["fun"]["url"] + url
        soup = common.get_soup_page_no_limit(cat_url)
        subcat = soup.find_all("li", class_="item last")
        if len(subcat) == 0:
            # No product tiles: treat this page as an intermediate category
            # and enqueue the sub-category links it advertises.
            for big_elm in soup.find_all("big"):
                b_href = big_elm.find("a")
                if b_href:
                    try:
                        rel_url = b_href['href'].replace(config["fun"]["url"], "")
                        if rel_url not in all_categories.values():
                            process_categories[b_href.text] = rel_url
                            all_categories[b_href.text] = rel_url
                    except Exception as err:
                        # Diagnostic dump; deliberately non-fatal.
                        print(url)
                        print("+++++++++++++")
                        print(cat_url)
                        print("+++++++++++++")
                        print(b_href)
                        print("+++++++++++++")
                        print(err)
                        print("=============")
            url = next_url(process_categories)
        else:
            for sc in subcat:
                product = sc.find("h2", class_="product-name")
                p_info = product.find("a")
                if p_info['href'] not in products.values():
                    products[p_info['title']] = p_info['href']
                    thread = threading.Thread(target=get_product_details,
                                              args=(p_info['href'], config, producer))
                    threads.append(thread)
                    thread.start()
            # Follow pagination before moving on to the next category.
            next_page = soup.find("a", class_="next i-next")
            if next_page is None:
                url = next_url(process_categories)
            else:
                url = next_page['href'].replace(config["fun"]["url"], "")
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['fun'])