#!/usr/bin/env python3
import re
import threading
import traceback

from kafka import KafkaProducer

import common


def get_categories(url, cookie):
    """Collect category URLs from the mega-dropdown menu on the home page."""
    soup = common.get_soup_page_no_limit(url, cookie)
    categories = {}
    dropdown = soup.find('div', class_="mxd-mega-dropdown")
    anchors = dropdown.find_all('a')
    for anchor in anchors:
        if anchor['href'].find('bricosolar') >= 0:
            print(f'remove {anchor["href"]}')
        elif anchor['href'] not in categories.values():
            if anchor['href'].find('https://www.brico.be') == -1:
                anchor['href'] = f'https://www.brico.be{anchor["href"]}'
            categories[anchor.text] = anchor['href']
    return categories


def get_product_details(prod_url, config, kafka_producer, failed):
    soup = None  # keep defined so the except handler can dump it even if the fetch fails
    try:
        # The semaphore caps how many product pages are scraped concurrently.
        with sema:
            soup = common.get_soup_page_no_limit(prod_url, config['brico']['cookie'])
            prod_detail = soup.find('div', class_='mxd-container mxd-product-detail-layout')
            if prod_detail is None:
                return
            prod_specs = soup.find('div', {'id': 'specs'})
            title = prod_detail.find('h1', class_='mxd-product-detail-layout-heading mxd-h1')
            product = {}
            price = {}
            product['title'] = title.find("span", {'itemprop': 'name'}).text
            product['url'] = prod_url
            product['image_url'] = prod_detail.find("img", class_="mxd-fixed-ratio-image")['src']
            product['info'] = prod_specs.find_all("p")[0].text
            product['sku_code'] = prod_url.split("/")[-1]
            # EAN codes are 8 to 13 digits; first try the specs block ...
            ean_pattern = "([0-9]{8,13})"
            if len(prod_specs.find_all("p")) > 1:
                ean_match = re.search(ean_pattern, prod_specs.find_all("p")[1].text)
                if ean_match:
                    product['ean_code'] = ean_match.group(1)
            # ... then fall back to the JSON embedded in the page's <script> tags.
            pattern_brand = r'brands":\[\{"code":"[\w\s]+","name":"([\w\s]+)'
            pattern_ean = r'"ean":"([0-9]{8,13})"'
            scripts = soup.find_all("script")
            for script in scripts:
                if script.string is not None:
                    if script.string.find('"brands"') >= 0:
                        match = re.search(pattern_brand, script.string)
                        if match:
                            product['brand'] = match.group(1)
                    if script.string.find('"ean"') >= 0 and 'ean_code' not in product:
                        match = re.search(pattern_ean, script.string)
                        if match:
                            product['ean_code'] = match.group(1)
            # Pages without an <ins> element carry no visible price; skip them.
            if soup.find("ins") is None:
                return
            price['price'] = soup.find("ins").find("meta")['content']
            price['promo'] = 0 if soup.find("del") is None else 1
            common.add_product(kafka_producer, config['brico']['name'], product, price)
    except Exception as err:
        common.dump_failed_product(config['brico']['name'], prod_url, soup, err, traceback.format_exc())


def next_url(url_list):
    """Pop the next category URL from the work queue (insertion order)."""
    if len(url_list) == 0:
        return None
    key = next(iter(url_list))
    url = url_list[key]
    del url_list[key]
    return url


maxthreads = 10
sema = threading.Semaphore(value=maxthreads)
threads = list()

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['brico']['name'],
                                          'url': config['brico']['url'],
                                          'image_url': config['brico']['logo']})
    common.clear_failed_product(config['brico']['name'])
    all_categories = get_categories(config['brico']['url'], config['brico']['cookie'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    failed_prod = {}
    i = 0
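
    # Crawl loop: categories are consumed as a work queue. Each listing page can
    # reveal extra sub-categories (mxd-product-list-link), which are appended to
    # the queue; every product card found is scraped in its own thread, and
    # paging follows the rel="next" link until the category is exhausted.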
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        soup = common.get_soup_page_no_limit(url, config['brico']['cookie'])
        page_products = soup.find_all("div", class_="mxd-block-card mxd-block-card-responsive")
        extra_cat = soup.find_all("a", class_="mxd-product-list-link")
        if len(extra_cat) > 0:
            for cat in extra_cat:
                if cat['href'].find('bricosolar') >= 0:
                    print(f'main remove {cat["href"]}')
                elif cat['href'] not in all_categories.values():
                    cat_name = cat.find("span").text
                    if cat['href'].find("/") > 0:
                        cat['href'] = f'/{cat["href"]}'
                    process_categories[cat_name] = f'{config["brico"]["url"]}{cat["href"]}'
                    #print(f'added {config["brico"]["url"]}{cat["href"]}')
                    all_categories[cat_name] = cat['href']
        for product in page_products:
            product = product.find("a", class_="mxd-block-card-link")
            if product['href'].find("/nl") == -1:
                product['href'] = f'/nl{product["href"]}'
            if product['href'].find("/") > 0:
                product['href'] = f'/{product["href"]}'
            if product['href'] not in products.values():
                prod_name = product.find("span", {"itemprop": "name"}).text
                #print(f'product url: {product["href"]}')
                products[prod_name] = product["href"]
                thread = threading.Thread(target=get_product_details,
                                          args=(f'{config["brico"]["url"]}{product["href"]}',
                                                config, producer, failed_prod))
                threads.append(thread)
                thread.start()
        next_page = soup.find("a", {"rel": "next"})
        if next_page is None:
            url = next_url(process_categories)
        else:
            base = soup.find("link", {"rel": "alternate", "hreflang": "nl-be"})
            url = f'{base["href"]}{next_page["href"]}'
            print(f'next page: {url}')

    for t in threads:
        t.join()
    common.update_store_prices(producer, config['brico'])
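
    # Not part of the original flow: kafka-python sends messages asynchronously,
    # so a final flush before exiting is a common safeguard (this assumes the
    # common.* helpers do not already flush or close the producer themselves).
    producer.flush()
    producer.close()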