You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 

158 lines
5.9 KiB

#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import re
import traceback
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer
def get_categories(url, cookie):
    """Collect category links from the store page's mega-dropdown menu.

    Returns a dict mapping anchor text -> absolute category URL.
    Links containing 'bricosolar' are skipped, and a URL already present
    in the result is not added twice.
    """
    soup = common.get_soup_page_no_limit(url, cookie)
    categories = {}
    menu = soup.find('div', class_="mxd-mega-dropdown")
    for link in menu.find_all('a'):
        href = link['href']
        if href.find('bricosolar') >= 0:
            # Deliberately excluded category.
            print(f'remove {href}')
            continue
        if href in categories.values():
            continue
        if href.find('https://www.brico.be') == -1:
            # Relative link: make it absolute (mutates the soup node, as before).
            href = f'https://www.brico.be{href}'
            link['href'] = href
        categories[link.text] = href
    return categories
def get_product_details(prod_url, config, kafka_producer, failed):
    """Scrape one Brico product detail page and publish product + price to Kafka.

    Parameters:
        prod_url: absolute URL of the product page.
        config: parsed configuration; reads config['brico'] (name, cookie).
        kafka_producer: producer handed through to common.add_product.
        failed: unused in this function; kept for caller compatibility.

    Any scraping error is recorded via common.dump_failed_product rather than
    raised, so one broken page cannot abort the whole crawl.
    """
    soup = None  # fix: the except handler references soup; keep it bound even if the fetch raises
    try:
        soup = common.get_soup_page_no_limit(prod_url, config['brico']['cookie'])
        prod_detail = soup.find('div', class_='mxd-container mxd-product-detail-layout')
        if prod_detail is None:
            return  # not a product detail page
        prod_specs = soup.find('div', {'id': 'specs'})
        title = prod_detail.find('h1', class_='mxd-product-detail-layout-heading mxd-h1')
        product = {}
        price = {}
        product['title'] = title.find("span", {'itemprop': 'name'}).text
        product['url'] = prod_url
        product['image_url'] = prod_detail.find("img", class_="mxd-fixed-ratio-image")['src']
        product['info'] = prod_specs.find_all("p")[0].text
        product['sku_code'] = prod_url.split("/")[-1]
        # EAN codes are 8-13 digits; first try the visible spec paragraphs.
        ean_pattern = r"([0-9]{8,13})"
        paragraphs = prod_specs.find_all("p")
        if len(paragraphs) > 1:
            ean_match = re.search(ean_pattern, paragraphs[1].text)
            if ean_match is not None:  # fix: don't fail the whole product when the paragraph has no EAN
                product['ean_code'] = ean_match.group(1)
        # Fallback: brand and EAN are also embedded as JSON in <script> tags.
        # fix: raw strings — the originals relied on invalid escapes (\: \") in
        # non-raw literals, which is a SyntaxWarning on modern Python.
        pattern_brand = r'brands":\[\{"code":"[\w\s]+","name":"([\w\s]+)'
        pattern_ean = r'"ean":"([0-9]{8,13})"'
        for script in soup.find_all("script"):
            if script.string is not None:
                if script.string.find('"brands"') >= 0:
                    match = re.search(pattern_brand, script.string)
                    product['brand'] = match.group(1)
                if script.string.find('"ean"') >= 0 and 'ean_code' not in product:
                    match = re.search(pattern_ean, script.string)
                    product['ean_code'] = match.group(1)
        # Price lives in <ins>; absent <ins> means no purchasable price shown.
        ins = soup.find("ins")  # fix: hoist the repeated soup.find("ins") lookups
        if ins is None:
            return
        price['price'] = ins.find("meta")['content']
        # A <del> element (struck-through old price) marks a promotion.
        price['promo'] = 0 if soup.find("del") is None else 1
        common.add_product(kafka_producer, config['brico']['name'], product, price)
    except Exception as err:
        common.dump_failed_product(config['brico']['name'], prod_url, soup, err, traceback.format_exc())
def next_url(url_list):
    """Pop and return the first URL from *url_list* (a name -> URL dict).

    The dict is mutated in place (the entry is removed); returns None when
    the dict is empty.
    """
    if not url_list:
        return None
    name, url = next(iter(url_list.items()))
    del url_list[name]
    return url
# Concurrency controls for the per-product scraping threads.
maxthreads = 10
# NOTE(review): this semaphore is created but never acquired anywhere in the
# visible code (get_product_details does not use it) — confirm whether thread
# throttling to `maxthreads` was intended but left unwired.
sema = threading.Semaphore(value=maxthreads)
# Every spawned product-detail thread; joined at the end of the main loop.
threads = list()
if __name__ == "__main__" :
    # Bootstrap: load config, connect the Kafka producer, register the store,
    # and clear this store's failed-product records from the previous run.
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['brico']['name'], 'url': config['brico']['url'], 'image_url': config['brico']['logo']})
    common.clear_failed_product(config['brico']['name'])
    # Seed the crawl: process_categories is the work queue (consumed by
    # next_url), while all_categories keeps everything ever seen for dedupe.
    all_categories = get_categories(config['brico']['url'], config['brico']['cookie'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}     # product name -> href, dedupes products across pages
    failed_prod = {}  # handed to every worker; presumably collects failures — TODO confirm (workers don't write it here)
    i = 0             # count of category pages processed, for progress output
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        soup = common.get_soup_page_no_limit(url, config['brico']['cookie'])
        page_products = soup.find_all("div", class_="mxd-block-card mxd-block-card-responsive")
        # Category pages may link to sub-categories; queue any new ones.
        extra_cat = soup.find_all("a", class_="mxd-product-list-link")
        if len(extra_cat) > 0:
            for cat in extra_cat:
                if cat['href'].find('bricosolar') >= 0:
                    # Deliberately skipped category.
                    print(f'main remove {cat["href"]}')
                    pass
                elif cat['href'] not in all_categories.values():
                    cat_name = cat.find("span").text
                    # Make the href root-relative before prefixing the base URL.
                    if cat['href'].find("/") > 0:
                        cat['href'] = f'/{cat["href"]}'
                    process_categories[cat_name] = f'{config["brico"]["url"]}{cat["href"]}'
                    #print(f'added {config["brico"]["url"]}{cat["href"]}')
                    all_categories[cat_name] = cat['href']
        # Spawn one worker thread per previously-unseen product on this page.
        for product in page_products:
            product = product.find("a", class_="mxd-block-card-link")
            # Normalize the href to a root-relative Dutch-locale ("/nl") path.
            if product['href'].find("/nl") == -1:
                product['href'] = f'/nl{product["href"]}'
            if product['href'].find("/") > 0:
                product['href'] = f'/{product["href"]}'
            if product['href'] not in products.values():
                prod_name = product.find("span", {"itemprop": "name"}).text
                #print(f'product url: {product["href"]}')
                products[prod_name] = product["href"]
                thread = threading.Thread(target=get_product_details,args=(f'{config["brico"]["url"]}{product["href"]}', config, producer, failed_prod))
                threads.append(thread)
                thread.start()
        # Follow pagination when present; otherwise pull the next queued category.
        next_page = soup.find("a", {"rel": "next"})
        if next_page is None:
            url = next_url(process_categories)
        else:
            # The nl-be alternate link supplies the base for the relative "next" href.
            base = soup.find("link", {"rel": "alternate", "hreflang": "nl-be"})
            url = f'{base["href"]}{next_page["href"]}'
            print(f'next page: {url}')
    # Wait for all product workers to finish before the final price update.
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['brico'])