#!/usr/bin/env python3
"""Scrape the Brouwland webshop catalogue and publish products to Kafka.

Walks every category page (queueing sub-categories discovered while
crawling), spawns a bounded number of worker threads to fetch product
detail pages, and pushes each product + price via the helpers in `common`.
"""
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import re
import traceback
import sys
import itertools
#logging.basicConfig(level=logging.DEBUG)
from time import sleep
from random import randint
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer
from ratelimit import limits, sleep_and_retry


def get_categories(url):
    """Return {category title: relative href} for every category tile on `url`.

    Tiles are <article class="item bg-gray"> elements; anchor titles look
    like "Name | Brouwland", so only the part before ' | ' is kept.
    """
    soup = common.get_soup_page(url)
    categories = {}
    articles = soup.find_all('article', class_="item bg-gray")
    for article in articles:
        anchor = article.find('a')
        anchor_title = anchor['title'].split(' | ')[0]
        categories[anchor_title] = anchor['href']
    return categories


def get_product_details(prod_url, config, kafka_producer, anchor_title):
    """Fetch one product page, extract its details and publish them to Kafka.

    Runs in a worker thread; `sema` caps the number of concurrent fetches.
    Any failure is recorded via common.dump_failed_product rather than raised.
    """
    soup = None  # BUGFIX: ensure the except handler never hits an unbound name
    with sema:   # BUGFIX: semaphore was created but never acquired before
        try:
            soup = common.get_soup_page(prod_url)
            if soup is None:
                common.dump_failed_product(config['brouwland']['name'], prod_url,
                                           soup, "No product details found", None)
                return
            prod_detail = soup.find('article')
            if prod_detail is None:
                common.dump_failed_product(config['brouwland']['name'], prod_url,
                                           soup, "No product details found", None)
                return

            title = prod_detail.find('h1', {'itemprop': 'name'})
            # BUGFIX: the image anchor may be absent; previously this raised
            # AttributeError and was mis-reported through the broad except.
            image_anchor = prod_detail.find('a', class_='image-click')
            image = image_anchor.find('img') if image_anchor is not None else None
            description = prod_detail.find('div', {'itemprop': 'description'})
            ean_code = prod_detail.find('td', {'itemprop': 'eancode'})
            sku = prod_detail.find('span', {'itemprop': 'sku'})
            brand = prod_detail.find('span', {'itemprop': 'brand'})
            price_detail = prod_detail.find('span', {'itemprop': 'price'})

            product = {}
            price = {}
            if title is not None:
                product['title'] = title.text.strip()
            product['url'] = prod_url
            if image is not None:
                product['image_url'] = image['src']
            if description is not None:
                product['info'] = description.text.strip()
            if sku is not None:
                product['sku_code'] = sku.text.strip()
            if ean_code is not None:
                product['ean_code'] = ean_code.text.strip()
            if brand is not None:
                product['brand'] = brand.text.strip()

            # Price text looks like "<currency> <amount>"; token [1] is the amount.
            # BUGFIX: guard the None case so a missing price reports cleanly.
            if price_detail is None:
                common.dump_failed_product(config['brouwland']['name'], prod_url,
                                           soup, "No product details found", None)
                return
            price['price'] = price_detail.text.split()[1]
            # A <del> element (struck-through old price) marks a promotion.
            price['promo'] = 0 if soup.find("del") is None else 1
            common.add_product(kafka_producer, config['brouwland']['name'],
                               product, price)
        except Exception as err:
            print(traceback.format_exc())
            common.dump_failed_product(config['brouwland']['name'], prod_url,
                                       soup, err, traceback.format_exc())


def next_url(url_list):
    """Pop and return the first queued URL from the dict, or None when empty."""
    if not url_list:
        return None
    key = next(iter(url_list))
    return url_list.pop(key)


maxthreads = 5
# Caps concurrent product-detail fetches (acquired in get_product_details).
sema = threading.Semaphore(value=maxthreads)
threads = list()

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(
        bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['brouwland']['name'],
                                          'url': config['brouwland']['url'],
                                          'image_url': config['brouwland']['logo']})
    common.clear_failed_product(config['brouwland']['name'])

    all_categories = get_categories(config['brouwland']['catalogue_url'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        url = f'{config["brouwland"]["url"]}{url}'
        soup = common.get_soup_page(url)
        if soup is None:
            url = next_url(process_categories)
            continue

        page_products = soup.find_all("article", class_="product")
        extra_cat = soup.find_all("article", class_="item bg-gray")
        # Categories nest: queue any sub-category tile we have not seen yet.
        for cat in extra_cat:
            anchor = cat.find('a')
            if anchor['href'] not in all_categories.values():
                anchor_title = anchor['title'].split(' | ')[0]
                process_categories[anchor_title] = f'{anchor["href"]}'
                all_categories[anchor_title] = anchor['href']
                #print(f'added {cat["data-href"]}')

        for product in page_products:
            anchor = product.find("a")
            if anchor['href'] not in products.values():
                anchor_title = anchor['title'].split(' | ')[0]
                if anchor_title.upper().find("CADEAU") > -1:
                    continue  # skip gift-voucher products
                products[anchor_title] = anchor["href"]
                thread = threading.Thread(
                    target=get_product_details,
                    args=(f'{config["brouwland"]["url"]}{anchor["href"]}',
                          config, producer, anchor_title))
                threads.append(thread)
                thread.start()

        # Follow pagination within the category; otherwise take the next one.
        next_page = soup.find("a", class_="next")
        if next_page is None:
            url = next_url(process_categories)
        else:
            url = next_page["href"]

    for t in threads:
        t.join()
    common.update_store_prices(producer, config['brouwland'])