You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

157 lines
5.9 KiB

3 years ago
  1. #!/usr/bin/env python3
  2. import requests
  3. import logging
  4. import xml.etree.ElementTree as ET
  5. import json
  6. import unidecode
  7. import sqlite3
  8. import common
  9. import threading
  10. import re
  11. import traceback
  12. from bs4 import BeautifulSoup
  13. from datetime import datetime
  14. from kafka import KafkaProducer
  15. def get_categories(url, cookie):
  16. soup = common.get_soup_page_no_limit(url, cookie)
  17. categories = {}
  18. dropdown = soup.find('div', class_="mxd-mega-dropdown")
  19. anchors = dropdown.find_all('a')
  20. for anchor in anchors:
  21. if anchor['href'].find('bricosolar') >= 0:
  22. print(f'remove {anchor["href"]}')
  23. pass
  24. elif anchor['href'] not in categories.values():
  25. if anchor['href'].find('https://www.brico.be') == -1:
  26. anchor['href'] = f'https://www.brico.be{anchor["href"]}'
  27. categories[anchor.text] = anchor['href']
  28. return categories
  29. def get_product_details(prod_url, config, kafka_producer, failed):
  30. try:
  31. soup = common.get_soup_page_no_limit(prod_url, config['brico']['cookie'])
  32. prod_detail = soup.find('div', class_='mxd-container mxd-product-detail-layout')
  33. if prod_detail is None:
  34. return
  35. prod_specs = soup.find('div', {'id': 'specs'})
  36. title = prod_detail.find('h1', class_='mxd-product-detail-layout-heading mxd-h1')
  37. product = {}
  38. price = {}
  39. product['title'] = title.find("span", {'itemprop': 'name'}).text
  40. product['url'] = prod_url
  41. product['image_url'] = prod_detail.find("img", class_="mxd-fixed-ratio-image")['src']
  42. product['info'] = prod_specs.find_all("p")[0].text
  43. product['sku_code'] = prod_url.split("/")[-1]
  44. ean_pattern = "([0-9]{8,13})"
  45. if len(prod_specs.find_all("p")) > 1:
  46. ean_match = re.search(ean_pattern, prod_specs.find_all("p")[1].text)
  47. product['ean_code'] = ean_match.group(1)
  48. pattern_brand = 'brands\"\:\[\{\"code\"\:\"[\w\s]+\",\"name\"\:\"([\w\s]+)'
  49. pattern_ean = '\"ean\"\:\"([0-9]{8,13})\"'
  50. scripts = soup.find_all("script")
  51. for script in scripts:
  52. if script.string is not None:
  53. if script.string.find('"brands"') >= 0:
  54. match = re.search(pattern_brand, script.string)
  55. product['brand'] = match.group(1)
  56. if script.string.find('"ean"') >= 0 and 'ean_code' not in product:
  57. match = re.search(pattern_ean, script.string)
  58. product['ean_code'] = match.group(1)
  59. if soup.find("ins") is None:
  60. return
  61. price['price'] = soup.find("ins").find("meta")['content']
  62. price['promo'] = 0 if soup.find("del") is None else 1
  63. common.add_product(kafka_producer, config['brico']['name'], product, price)
  64. except Exception as err:
  65. common.dump_failed_product(config['brico']['name'], prod_url, soup, err, traceback.format_exc())
  66. def next_url(url_list):
  67. if len(url_list) == 0:
  68. return None
  69. key = next(iter(url_list))
  70. url = url_list[key]
  71. del url_list[key]
  72. return url
# Intended cap on concurrent scraper threads.
maxthreads = 10
# NOTE(review): this semaphore is never acquired anywhere in this file --
# presumably meant to enforce maxthreads; confirm before relying on the limit.
sema = threading.Semaphore(value=maxthreads)
# Every spawned worker thread; joined at the end of the main loop.
threads = list()
if __name__ == "__main__" :
    # Bootstrap: load config, connect to Kafka, register the store, and
    # clear previously recorded scrape failures for this store.
    config = common.get_config()
    # NOTE(review): the config key is spelled 'kafka_boostrap_servers' (sic)
    # to match the config file -- do not "fix" the spelling here alone.
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['brico']['name'], 'url': config['brico']['url'], 'image_url': config['brico']['logo']})
    common.clear_failed_product(config['brico']['name'])
    # all_categories remembers every category ever seen (for de-duplication);
    # process_categories is the work queue that next_url() drains in place.
    all_categories = get_categories(config['brico']['url'], config['brico']['cookie'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}     # product name -> href, de-duplicates product pages
    failed_prod = {}  # handed to workers but never populated there
    i = 0             # pages processed, for the progress print only
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        soup = common.get_soup_page_no_limit(url, config['brico']['cookie'])
        page_products = soup.find_all("div", class_="mxd-block-card mxd-block-card-responsive")
        # Listing pages can link to deeper sub-categories; queue unseen ones,
        # excluding the 'bricosolar' section.
        extra_cat = soup.find_all("a", class_="mxd-product-list-link")
        if len(extra_cat) > 0:
            for cat in extra_cat:
                if cat['href'].find('bricosolar') >= 0:
                    print(f'main remove {cat["href"]}')
                    pass
                elif cat['href'] not in all_categories.values():
                    cat_name = cat.find("span").text
                    # make the href root-relative before prefixing the site URL
                    if cat['href'].find("/") > 0:
                        cat['href'] = f'/{cat["href"]}'
                    process_categories[cat_name] = f'{config["brico"]["url"]}{cat["href"]}'
                    #print(f'added {config["brico"]["url"]}{cat["href"]}')
                    all_categories[cat_name] = cat['href']
        # One worker thread per not-yet-seen product card on this page.
        for product in page_products:
            product = product.find("a", class_="mxd-block-card-link")
            # Normalise to a root-relative Dutch-language ('/nl') path.
            if product['href'].find("/nl") == -1:
                product['href'] = f'/nl{product["href"]}'
            if product['href'].find("/") > 0:
                product['href'] = f'/{product["href"]}'
            if product['href'] not in products.values():
                prod_name = product.find("span", {"itemprop": "name"}).text
                #print(f'product url: {product["href"]}')
                products[prod_name] = product["href"]
                # NOTE(review): thread creation is unbounded -- the module-level
                # semaphore 'sema' is never acquired; confirm the intended cap.
                thread = threading.Thread(target=get_product_details,args=(f'{config["brico"]["url"]}{product["href"]}', config, producer, failed_prod))
                threads.append(thread)
                thread.start()
        # Follow pagination when present, else move to the next queued category.
        next_page = soup.find("a", {"rel": "next"})
        if next_page is None:
            url = next_url(process_categories)
        else:
            # pagination hrefs are relative to the nl-be alternate base URL
            base = soup.find("link", {"rel": "alternate", "hreflang": "nl-be"})
            url = f'{base["href"]}{next_page["href"]}'
            print(f'next page: {url}')
    # Wait for every worker before publishing the final price update.
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['brico'])