You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

153 lines
5.1 KiB

3 years ago
  1. #!/usr/bin/env python3
  2. import requests
  3. import logging
  4. import xml.etree.ElementTree as ET
  5. import json
  6. import unidecode
  7. import sqlite3
  8. import common
  9. import threading
  10. import re
  11. import traceback
  12. import sys
  13. import itertools
  14. import logging
  15. #logging.basicConfig(level=logging.DEBUG)
  16. from time import sleep
  17. from random import randint
  18. from bs4 import BeautifulSoup
  19. from datetime import datetime
  20. from kafka import KafkaProducer
  21. from ratelimit import limits, sleep_and_retry
  22. def get_categories(url):
  23. soup = common.get_soup_page(url)
  24. categories = {}
  25. articles = soup.find_all('article', class_="item bg-gray")
  26. for article in articles:
  27. anchor = article.find('a')
  28. anchor_title = anchor['title'].split(' | ')[0]
  29. categories[anchor_title] = anchor['href']
  30. return categories
  31. def get_product_details(prod_url, config, kafka_producer, anchor_title):
  32. try:
  33. soup = common.get_soup_page(prod_url)
  34. if soup is None:
  35. common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None)
  36. return
  37. prod_detail = soup.find('article')
  38. if prod_detail is None:
  39. common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None)
  40. return
  41. title = prod_detail.find('h1', {'itemprop': 'name'})
  42. image = prod_detail.find('a', class_='image-click').find('img')
  43. description = prod_detail.find('div', {'itemprop': 'description'})
  44. ean_code = prod_detail.find('td', {'itemprop': 'eancode'})
  45. sku = prod_detail.find('span', {'itemprop': 'sku'})
  46. brand = prod_detail.find('span', {'itemprop': 'brand'})
  47. price_detail = prod_detail.find('span', {'itemprop': 'price'})
  48. product = {}
  49. price = {}
  50. if title is not None:
  51. product['title'] = title.text.strip()
  52. product['url'] = prod_url
  53. product['image_url'] = image['src']
  54. if description is not None:
  55. product['info'] = description.text.strip()
  56. if sku is not None:
  57. product['sku_code'] = sku.text.strip()
  58. if ean_code is not None:
  59. product['ean_code'] = ean_code.text.strip()
  60. if brand is not None:
  61. product['brand'] = brand.text.strip()
  62. price['price'] = price_detail.text.split()[1]
  63. price['promo'] = 0 if soup.find("del") is None else 1
  64. common.add_product(kafka_producer, config['brouwland']['name'], product, price)
  65. except Exception as err:
  66. print(traceback.format_exc())
  67. common.dump_failed_product(config['brouwland']['name'], prod_url, soup, err, traceback.format_exc())
  68. def next_url(url_list):
  69. if len(url_list) == 0:
  70. return None
  71. key = next(iter(url_list))
  72. url = url_list[key]
  73. del url_list[key]
  74. return url
# Shared thread bookkeeping for the scraper.
# `sema` is sized to allow at most `maxthreads` concurrent workers.
# NOTE(review): in the code visible here the semaphore is created but never
# acquired (threads are started without it), so the cap has no effect —
# confirm whether get_product_details was meant to acquire it.
maxthreads = 5
sema = threading.Semaphore(value=maxthreads)
# All started worker threads; joined at the end of __main__ before
# publishing final store prices.
threads = list()
  78. if __name__ == "__main__" :
  79. config = common.get_config()
  80. producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
  81. common.insert_update_store(producer, {'store': config['brouwland']['name'], 'url': config['brouwland']['url'], 'image_url': config['brouwland']['logo']})
  82. common.clear_failed_product(config['brouwland']['name'])
  83. all_categories = get_categories(config['brouwland']['catalogue_url'])
  84. process_categories = all_categories.copy()
  85. url = next_url(process_categories)
  86. products = {}
  87. i = 0
  88. while url is not None:
  89. i = i + 1
  90. print(f'{i}/{len(all_categories)} - {len(process_categories)}')
  91. url = f'{config["brouwland"]["url"]}{url}'
  92. soup = common.get_soup_page(url)
  93. if soup is None:
  94. url = next_url(process_categories)
  95. continue
  96. page_products = soup.find_all("article", class_="product")
  97. extra_cat = soup.find_all("article", class_="item bg-gray")
  98. if len(extra_cat) > 0:
  99. for cat in extra_cat:
  100. anchor = cat.find('a')
  101. if anchor['href'] not in all_categories.values():
  102. anchor_title = anchor['title'].split(' | ')[0]
  103. process_categories[anchor_title] = f'{anchor["href"]}'
  104. all_categories[anchor_title] = anchor['href']
  105. #print(f'added {cat["data-href"]}')
  106. for product in page_products:
  107. anchor = product.find("a")
  108. if anchor['href'] not in products.values():
  109. anchor_title = anchor['title'].split(' | ')[0]
  110. if anchor_title.upper().find("CADEAU") > -1:
  111. continue
  112. products[anchor_title] = anchor["href"]
  113. thread = threading.Thread(target=get_product_details,args=(f'{config["brouwland"]["url"]}{anchor["href"]}', config, producer, anchor_title))
  114. threads.append(thread)
  115. thread.start()
  116. next_page = soup.find("a", class_="next")
  117. if next_page is None:
  118. url = next_url(process_categories)
  119. else:
  120. url = next_page["href"]
  121. for t in threads:
  122. t.join()
  123. common.update_store_prices(producer, config['brouwland'])