commit 58a04a17984a450dce83e6d733383fc22e551eec Author: Steven Kuterna Date: Thu May 6 15:09:20 2021 +0100 first commit diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/cheap_shopper.db b/cheap_shopper.db new file mode 100644 index 0000000..74e868e Binary files /dev/null and b/cheap_shopper.db differ diff --git a/cheap_shopper.yaml b/cheap_shopper.yaml new file mode 100644 index 0000000..a7a7a0d --- /dev/null +++ b/cheap_shopper.yaml @@ -0,0 +1,54 @@ +config: + loglevel: logging.DEBUG + log_sql: False + sqlitedb: cheap_shopper.db + active_stores: action, fun, brico + kafka_boostrap_servers: apollo.home.lan +action: + name: Action + url: https://www.action.com + logo: https://upload.wikimedia.org/wikipedia/commons/4/45/Action_Nederland_Logo_2020.svg + categories_url: https://www.action.com/api/navigation/categories?language=nl-BE + products_url: https://www.action.com/api/subcategories/products/?routeSegment=__url__&offset=__offset__&limit=__limit__ + products_cookie: {'epslanguage': 'nl-BE'} +brico: + name: Brico + url: https://www.brico.be + logo: https://vdm.bricocdn.be/logos/brico.svg?ie + cookie: {'language': 'nl'} +brouwland: + name: Brouwland + url: https://www.brouwland.com + logo: https://brouwlandprod-yappa.netdna-ssl.com/build/images/logo-brouwland2.2fc01423.png + catalogue_url: https://www.brouwland.com/nl/onze-producten +dreamland: + name: Dreamland + url: https://www.dreamland.be/e/nl/dl + logo: https://seeklogo.com/images/D/dreamland-be-logo-7F2C0508F2-seeklogo.com.png +delhaize: + name: Delhaize + url: https://delhaize.be/shop + logo: +fun: + name: Fun + url: https://www.fun.be + logo: https://e7.pngegg.com/pngimages/935/634/png-clipart-fun-logo-fun-toyshop-logo-icons-logos-emojis-toy-shop-logos.png + categories_url: https://www.fun.be/seall/menu/index +gamma: + name: Gamma + url: https://www.gamma.be + logo: https://nl.wikipedia.org/wiki/Gamma_(winkel)#/media/Bestand:Gamma_logo_2010.png + categories_url: https://www.gamma.be/nl/resources/menu/categories +hubo: + name: Hubo + url: https://www.hubo.be + logo: https://www.hubo.be/content/dam/hubo/site-afbeeldingen/hubo-algemeen/logo.svg + categories_url: https://www.hubo.be/nl/a.html + products_url: https://sd.searchnode.net/v1/query/docs?query_key=CzVuTtVooUCa8uwwNA6tA9X6lMXVLq01&sort=-score&filters.categoriesMeta=assortiment&limit=60 + products_url_offset: https://sd.searchnode.net/v1/query/docs?query_key=CzVuTtVooUCa8uwwNA6tA9X6lMXVLq01&sort=-score&filters.categoriesMeta=assortiment&offset=__OFFSET__ +intratuin_nl: + name: Intratuin NL + url: https://www.intratuin.nl +intratuin_be: + name: Intratuin BE + url: https://www.intratuin.be \ No newline at end of file diff --git a/common.py b/common.py new file mode 100644 index 0000000..11f6f6e --- /dev/null +++ b/common.py @@ -0,0 +1,152 @@ +import yaml +import json +import os +import shutil +import traceback +import requests +import re + +from bs4 import BeautifulSoup +from ratelimit import limits, sleep_and_retry +from time import sleep +from random import randint +from datetime import datetime + + +def get_config(): + with open('cheap_shopper.yaml', 'r') as ymlfile: + cfg = yaml.load(ymlfile, Loader=yaml.FullLoader) + + return cfg + + +def add_product(kafka_producer, store, product, price): + db_object = {} + db_object['type'] = 'product' + db_object['store'] = store + db_object['product'] = product + db_object['price'] = price + + db_object_json = json.dumps(db_object) + db_object_bytes = bytearray(db_object_json, 'utf-8') + + 
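    # publish the JSON-encoded payload on the 'shopper_db' topic; db_consumer.py
    # picks these 'product' messages up and writes them to the SQLite database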
send_kafka_message(kafka_producer, db_object_bytes) + + +def insert_update_store(kafka_producer, store): + db_object = {} + db_object['type'] = 'store' + db_object['store'] = store + + db_object_json = json.dumps(db_object) + db_object_bytes = bytearray(db_object_json, 'utf-8') + + send_kafka_message(kafka_producer, db_object_bytes) + + +def send_kafka_message(kafka_producer, message): + kafka_producer.send('shopper_db', message) + + +def update_store_prices(kafka_producer, config): + insert_update_store(kafka_producer, {'store': config['name'], 'url': config['url'], 'image_url': config['logo'], 'last_update': datetime.now().strftime('%d/%m/%Y')}) + + db_object = {} + db_object['type'] = 'store_update' + db_object['store'] = config['name'] + + db_object_json = json.dumps(db_object) + db_object_bytes = bytearray(db_object_json, 'utf-8') + send_kafka_message(kafka_producer, db_object_bytes) + print(f'updating prices {db_object_json}') + + +def clear_failed_product(store): + if not os.path.exists('failed'): + os.mkdir('failed') + if not os.path.exists(f'failed/{store}'): + os.mkdir(f'failed/{store}') + return + else: + for dir in os.listdir(f'failed/{store}/'): + shutil.rmtree(f'failed/{store}/{dir}') + + +def dump_failed_product(store, prod_url, page, err, trace): + if not os.path.exists('failed'): + os.mkdir('failed') + if not os.path.exists(f'failed/{store}'): + os.mkdir(f'failed/{store}') + dirname = prod_url.replace('https://','') + dirname = dirname.replace('.','-') + dirname = dirname.replace('/','_') + os.mkdir(f'failed/{store}/{dirname}') + err_file = open(f'failed/{store}/{dirname}/error.txt', "w") + err_file.write(f'{prod_url}\n') + err_file.write('===========================================\n') + err_file.write(f'{str(err)}\n') + err_file.write('===========================================\n') + err_file.write(str(trace)) + err_file.close() + page_file = open(f'failed/{store}/{dirname}/page.html', "w", encoding="utf-8") + page_file.write(str(page)) + page_file.close() + +def get_proxies(): + page = requests.get("https://free-proxy-list.net/") + soup = BeautifulSoup(page.content, "html.parser") + raw_div = soup.find("div", {"id": "raw"}) + raw_textarea = raw_div.find("textarea") + ip_list = re.findall("([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+\.\:[0-9]+)", raw_textarea.text) + ip_dict = {} + for ip in ip_list: + ip_addr = ip.split(":")[0] + ip_port = ip.split(":")[1] + ip_dict[ip_addr] = ip_port + return ip_list + + +@sleep_and_retry +@limits(calls=2, period=1) +def get_soup_page(url, cookie=None): + #print(f'get page for soup: {url}') + sleep(randint(1,2)) + soup = None + try: + headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36', 'content-type':'text/html;charset=UTF-8'} + #page = requests.get(url, proxies={"http": next(proxy_pool), "https": next(proxy_pool)}) + page = requests.get(url, cookies=cookie, headers=headers) + if page.status_code != 200 and page.status_code != 301: + return None + soup = BeautifulSoup(page.content, "html.parser") + except ConnectionRefusedError: + print(traceback.format_exc()) + sleep(randint(2,3)) + soup = get_soup_page(url) + except Exception as err: + print(traceback.format_exc()) + print(err) + + return soup + +def get_soup_page_no_limit(url, cookie=None, payload=None, headers={}): + #print(f'get page for soup: {url}') + sleep(randint(1,2)) + soup = None + try: + if len(headers) == 0: + headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36', + 'content-type':'text/html;charset=UTF-8'} + if 'user-agent' not in headers: + headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36' + headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51' + #page = requests.get(url, proxies={"http": next(proxy_pool), "https": next(proxy_pool)}) + page = requests.get(url, cookies=cookie, data=payload, headers=headers) + #print(page.content) + if page.status_code != 200 and page.status_code != 301: + return None + soup = BeautifulSoup(page.content, "html.parser") + except Exception as err: + print(traceback.format_exc()) + print(err) + return soup diff --git a/db_consumer.py b/db_consumer.py new file mode 100644 index 0000000..9abe3ce --- /dev/null +++ b/db_consumer.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python3 + +import sqlite3 +import common +import json +import time +import threading + +from datetime import datetime +from kafka import KafkaConsumer + + +def committer(connection): + while True: + connection.commit() + time.sleep(10) + + +def insert_update_store(connection, stores, value_json): + cursor = connection.cursor() + + columns = '(' + values_sql = f'(' + values = [] + update_sql = '' + update_values = [] + + for dict_key in value_json['store']: + columns = f'{columns}{dict_key}, ' + values.append(value_json['store'][dict_key]) + values_sql = f'{values_sql}?, ' + if dict_key != 'store': + update_sql = f'{update_sql}{dict_key}=?, ' + update_values.append(value_json['store'][dict_key]) + + columns = columns[:-2] + ')' + values_sql = values_sql[:-2] + ')' + update_sql = update_sql[:-2] + update_values.append(value_json["store"]["store"]) + + try: + sql_statement = f'INSERT INTO store {columns} VALUES {values_sql}' + cursor.execute(sql_statement, values) + stores[value_json["store"]["store"]] = cursor.lastrowid + except sqlite3.IntegrityError as err: + try: + sql_statement = f'UPDATE store SET {update_sql} WHERE store=?' + cursor.execute(sql_statement, tuple(update_values)) + except Exception as err: + print(err) + except Exception as err: + print(err) + + +def insert_update_product(connection, stores, value_json): + cursor = connection.cursor() + + columns = '(store, ' + values_sql = f'(?, ' + values = [stores[value_json["store"]]] + update_sql = '' + update_values = [] + + for dict_key in value_json['product']: + columns = f'{columns}{dict_key}, ' + values.append(value_json['product'][dict_key]) + values_sql = f'{values_sql}?, ' + if dict_key != 'sku_code': + update_sql = f'{update_sql}{dict_key}=?, ' + update_values.append(value_json['product'][dict_key]) + + columns = columns[:-2] + ')' + values_sql = values_sql[:-2] + ')' + update_sql = update_sql[:-2] + update_values.append(stores[value_json["store"]]) + update_values.append(value_json["product"]["sku_code"]) + + product_id = None + + try: + sql_statement = f'INSERT INTO products {columns} VALUES {values_sql}' + cursor.execute(sql_statement, values) + product_id = cursor.lastrowid + #print(f'inserted {product_id}') + except sqlite3.IntegrityError as err: + try: + sql_statement = f'UPDATE products SET {update_sql} WHERE store=? and sku_code=?' + cursor.execute(sql_statement, tuple(update_values)) + sql_statement = f'SELECT id FROM products WHERE store=? and sku_code=?' 
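            # the INSERT failed (presumably a UNIQUE(store, sku_code) constraint), so the
            # existing row was updated above; fetch its id for insert_update_price() below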
+ cursor.execute(sql_statement, (stores[value_json["store"]], value_json["product"]["sku_code"])) + product_id = cursor.fetchone()[0] + #print(f'updated {product_id}') + except Exception as err: + print(err) + except Exception as err: + print(err) + + insert_update_price(connection, stores, value_json, product_id) + +def insert_update_price(connection, stores, value_json, product_id): + cursor = connection.cursor() + + columns = '(product_id, last_update, ' + values_sql = f'(?, ?, ' + values = [product_id, datetime.now().strftime('%d/%m/%Y')] + + for dict_key in value_json['price']: + columns = f'{columns}{dict_key}, ' + values.append(value_json['price'][dict_key]) + values_sql = f'{values_sql}?, ' + + columns = columns[:-2] + ')' + values_sql = values_sql[:-2] + ')' + + price_exists = False + try: + sql_statement = f'SELECT id FROM price WHERE product_id=? and price=? and active=1' + cursor.execute(sql_statement, (product_id, value_json["price"]["price"])) + if cursor.fetchone(): + price_exists = True + except Exception as err: + print(err) + try: + if not price_exists: + sql_statement = f'INSERT INTO price {columns} VALUES {values_sql}' + cursor.execute(sql_statement, values) + else: + sql_statement = f'UPDATE price SET last_update=? WHERE product_id=? and price=? and active=1' + cursor.execute(sql_statement, (datetime.now().strftime("%d/%m/%Y"), product_id, value_json["price"]["price"])) + except sqlite3.IntegrityError as err: + print(err) + except Exception as err: + print(err) + + +def deactivate_old_price(connection, stores, value_json): + cursor = connection.cursor() + try: + sql_statement=f'UPDATE price SET active=0 WHERE product_id IN (SELECT id FROM products WHERE store=?) AND last_update= 0: + print(f'remove {anchor["href"]}') + pass + elif anchor['href'] not in categories.values(): + if anchor['href'].find('https://www.brico.be') == -1: + anchor['href'] = f'https://www.brico.be{anchor["href"]}' + categories[anchor.text] = anchor['href'] + + return categories + + +def get_product_details(prod_url, config, kafka_producer, failed): + try: + soup = common.get_soup_page_no_limit(prod_url, config['brico']['cookie']) + + prod_detail = soup.find('div', class_='mxd-container mxd-product-detail-layout') + if prod_detail is None: + return + prod_specs = soup.find('div', {'id': 'specs'}) + title = prod_detail.find('h1', class_='mxd-product-detail-layout-heading mxd-h1') + + product = {} + price = {} + product['title'] = title.find("span", {'itemprop': 'name'}).text + product['url'] = prod_url + product['image_url'] = prod_detail.find("img", class_="mxd-fixed-ratio-image")['src'] + product['info'] = prod_specs.find_all("p")[0].text + product['sku_code'] = prod_url.split("/")[-1] + ean_pattern = "([0-9]{8,13})" + if len(prod_specs.find_all("p")) > 1: + ean_match = re.search(ean_pattern, prod_specs.find_all("p")[1].text) + product['ean_code'] = ean_match.group(1) + + pattern_brand = 'brands\"\:\[\{\"code\"\:\"[\w\s]+\",\"name\"\:\"([\w\s]+)' + pattern_ean = '\"ean\"\:\"([0-9]{8,13})\"' + scripts = soup.find_all("script") + for script in scripts: + if script.string is not None: + if script.string.find('"brands"') >= 0: + match = re.search(pattern_brand, script.string) + product['brand'] = match.group(1) + if script.string.find('"ean"') >= 0 and 'ean_code' not in product: + match = re.search(pattern_ean, script.string) + product['ean_code'] = match.group(1) + if soup.find("ins") is None: + return + price['price'] = soup.find("ins").find("meta")['content'] + price['promo'] = 0 if 
soup.find("del") is None else 1 + + common.add_product(kafka_producer, config['brico']['name'], product, price) + except Exception as err: + common.dump_failed_product(config['brico']['name'], prod_url, soup, err, traceback.format_exc()) + + +def next_url(url_list): + if len(url_list) == 0: + return None + + key = next(iter(url_list)) + url = url_list[key] + del url_list[key] + return url + + +maxthreads = 10 +sema = threading.Semaphore(value=maxthreads) +threads = list() + +if __name__ == "__main__" : + config = common.get_config() + + producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']]) + common.insert_update_store(producer, {'store': config['brico']['name'], 'url': config['brico']['url'], 'image_url': config['brico']['logo']}) + common.clear_failed_product(config['brico']['name']) + + all_categories = get_categories(config['brico']['url'], config['brico']['cookie']) + + process_categories = all_categories.copy() + url = next_url(process_categories) + products = {} + failed_prod = {} + i = 0 + while url is not None: + i = i + 1 + print(f'{i}/{len(all_categories)} - {len(process_categories)}') + + soup = common.get_soup_page_no_limit(url, config['brico']['cookie']) + page_products = soup.find_all("div", class_="mxd-block-card mxd-block-card-responsive") + extra_cat = soup.find_all("a", class_="mxd-product-list-link") + + if len(extra_cat) > 0: + for cat in extra_cat: + if cat['href'].find('bricosolar') >= 0: + print(f'main remove {cat["href"]}') + pass + elif cat['href'] not in all_categories.values(): + cat_name = cat.find("span").text + if cat['href'].find("/") > 0: + cat['href'] = f'/{cat["href"]}' + process_categories[cat_name] = f'{config["brico"]["url"]}{cat["href"]}' + #print(f'added {config["brico"]["url"]}{cat["href"]}') + all_categories[cat_name] = cat['href'] + + for product in page_products: + product = product.find("a", class_="mxd-block-card-link") + if product['href'].find("/nl") == -1: + product['href'] = f'/nl{product["href"]}' + if product['href'].find("/") > 0: + product['href'] = f'/{product["href"]}' + if product['href'] not in products.values(): + prod_name = product.find("span", {"itemprop": "name"}).text + #print(f'product url: {product["href"]}') + products[prod_name] = product["href"] + + thread = threading.Thread(target=get_product_details,args=(f'{config["brico"]["url"]}{product["href"]}', config, producer, failed_prod)) + threads.append(thread) + thread.start() + + next_page = soup.find("a", {"rel": "next"}) + if next_page is None: + url = next_url(process_categories) + else: + base = soup.find("link", {"rel": "alternate", "hreflang": "nl-be"}) + url = f'{base["href"]}{next_page["href"]}' + print(f'next page: {url}') + + + for t in threads: + t.join() + + common.update_store_prices(producer, config['brico']) \ No newline at end of file diff --git a/parser_brouwland.py b/parser_brouwland.py new file mode 100644 index 0000000..497e139 --- /dev/null +++ b/parser_brouwland.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 + +import requests +import logging +import xml.etree.ElementTree as ET +import json +import unidecode +import sqlite3 +import common +import threading +import re +import traceback +import sys +import itertools +import logging +#logging.basicConfig(level=logging.DEBUG) + +from time import sleep +from random import randint +from bs4 import BeautifulSoup +from datetime import datetime +from kafka import KafkaProducer +from ratelimit import limits, sleep_and_retry + + + +def get_categories(url): + soup = 
common.get_soup_page(url) + + categories = {} + + articles = soup.find_all('article', class_="item bg-gray") + + for article in articles: + anchor = article.find('a') + anchor_title = anchor['title'].split(' | ')[0] + categories[anchor_title] = anchor['href'] + + return categories + + +def get_product_details(prod_url, config, kafka_producer, anchor_title): + try: + soup = common.get_soup_page(prod_url) + if soup is None: + common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None) + return + + prod_detail = soup.find('article') + if prod_detail is None: + common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None) + return + title = prod_detail.find('h1', {'itemprop': 'name'}) + image = prod_detail.find('a', class_='image-click').find('img') + description = prod_detail.find('div', {'itemprop': 'description'}) + ean_code = prod_detail.find('td', {'itemprop': 'eancode'}) + sku = prod_detail.find('span', {'itemprop': 'sku'}) + brand = prod_detail.find('span', {'itemprop': 'brand'}) + price_detail = prod_detail.find('span', {'itemprop': 'price'}) + + product = {} + price = {} + if title is not None: + product['title'] = title.text.strip() + product['url'] = prod_url + product['image_url'] = image['src'] + if description is not None: + product['info'] = description.text.strip() + if sku is not None: + product['sku_code'] = sku.text.strip() + if ean_code is not None: + product['ean_code'] = ean_code.text.strip() + if brand is not None: + product['brand'] = brand.text.strip() + + price['price'] = price_detail.text.split()[1] + price['promo'] = 0 if soup.find("del") is None else 1 + + common.add_product(kafka_producer, config['brouwland']['name'], product, price) + except Exception as err: + print(traceback.format_exc()) + common.dump_failed_product(config['brouwland']['name'], prod_url, soup, err, traceback.format_exc()) + + +def next_url(url_list): + if len(url_list) == 0: + return None + + key = next(iter(url_list)) + url = url_list[key] + del url_list[key] + return url + + +maxthreads = 5 +sema = threading.Semaphore(value=maxthreads) +threads = list() + +if __name__ == "__main__" : + config = common.get_config() + + producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']]) + common.insert_update_store(producer, {'store': config['brouwland']['name'], 'url': config['brouwland']['url'], 'image_url': config['brouwland']['logo']}) + common.clear_failed_product(config['brouwland']['name']) + + all_categories = get_categories(config['brouwland']['catalogue_url']) + + process_categories = all_categories.copy() + url = next_url(process_categories) + products = {} + i = 0 + while url is not None: + i = i + 1 + print(f'{i}/{len(all_categories)} - {len(process_categories)}') + url = f'{config["brouwland"]["url"]}{url}' + soup = common.get_soup_page(url) + if soup is None: + url = next_url(process_categories) + continue + page_products = soup.find_all("article", class_="product") + extra_cat = soup.find_all("article", class_="item bg-gray") + + if len(extra_cat) > 0: + for cat in extra_cat: + anchor = cat.find('a') + if anchor['href'] not in all_categories.values(): + anchor_title = anchor['title'].split(' | ')[0] + process_categories[anchor_title] = f'{anchor["href"]}' + all_categories[anchor_title] = anchor['href'] + #print(f'added {cat["data-href"]}') + + for product in page_products: + anchor = product.find("a") + if anchor['href'] not in products.values(): + anchor_title = 
anchor['title'].split(' | ')[0] + if anchor_title.upper().find("CADEAU") > -1: + continue + products[anchor_title] = anchor["href"] + + thread = threading.Thread(target=get_product_details,args=(f'{config["brouwland"]["url"]}{anchor["href"]}', config, producer, anchor_title)) + threads.append(thread) + thread.start() + + next_page = soup.find("a", class_="next") + if next_page is None: + url = next_url(process_categories) + else: + url = next_page["href"] + + for t in threads: + t.join() + + common.update_store_prices(producer, config['brouwland']) diff --git a/parser_dreamland.py b/parser_dreamland.py new file mode 100644 index 0000000..c979101 --- /dev/null +++ b/parser_dreamland.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 + +import requests +import logging +import xml.etree.ElementTree as ET +import json +import unidecode +import sqlite3 +import common +import threading +import re +import traceback +import sys + +from bs4 import BeautifulSoup +from datetime import datetime +from kafka import KafkaProducer +from ratelimit import limits, sleep_and_retry + + +def get_categories(url): + soup = dreamland_get_soup(url) + + categories = {} + + main_categories = soup.find_all("p", class_="subNav__categoryTitle") + for cat in main_categories: + anchor = cat.find("a") + categories[anchor.text] = anchor['href'] + + return categories + + +@sleep_and_retry +@limits(calls=1, period=2) +def dreamland_get_soup(url, payload=None, cookie=None, headers={}): + return common.get_soup_page_no_limit(url, payload=payload, cookie=cookie, headers=headers) + + +def get_product_details(prod_url, config, kafka_producer): + try: + soup = dreamland_get_soup(prod_url) + + product = {} + price = {} + sku = soup.find("span", class_="sku").text.split(":")[1] + title = soup.find("h1", class_="main_header").text + image = soup.find("img", {'id': 'productMainImage'})['src'] + desc = soup.find("div", class_="product_text").text + attrs = soup.find("div", {'id': 'Attributes_table'}) + items = attrs.find_all('li', attrs={'class': re.compile('.*attribute.*')}) + ean_list = [] + for item in items: + if item.text.find('Merk') > -1: + product['brand'] = item.find("span").text.strip('">').strip() + if item.text.find('EAN') > -1: + ean_codes= item.find_all("span") + for code in ean_codes: + ean = code.text.strip('">').strip() + ean_list.append(ean) + + product['sku_code'] = sku + product['url'] = prod_url + product['title'] = title + product['image_url'] = image + product['info'] = desc + product['ean_code'] = ", ".join(ean_list) + + + if soup.find("div", class_="price red mini") is None: + price['price'] = soup.find("div", class_="product_price").text.strip("€\xa0") + price['promo'] = 0 + else: + price['price'] = soup.find("div", class_="price red mini").text.strip("€\xa0") + price['promo'] = 1 + + common.add_product(kafka_producer, config['dreamland']['name'], product, price) + except Exception as err: + common.dump_failed_product(config['dreamland']['name'], prod_url, soup, err, traceback.format_exc()) + + +def get_productListView(soup): + scripts = soup.find_all("script") + list_view_pattern = "\'(ProductListingView.*)',{" + listview = None + if scripts is None: + return listview + for script in scripts: + if script.string is not None: + if script.string.find("ProductListingView") > 0: + listview_match = re.search(list_view_pattern, script.string) + listview = listview_match.group(1) + return listview + + +def next_url(url_list): + if len(url_list) == 0: + return None + + key = next(iter(url_list)) + url = url_list[key] + del 
url_list[key] + return url + + +def get_dreamland_productListingView(url, index=0): + soup = None + #payload='contentBeginIndex=0&productBeginIndex=__INDEX__&beginIndex=__INDEX__&orderBy=8&facetId=&pageView=grid&resultType=both&orderByContent=&searchTerm=&facet=&facetLimit=&minPrice=&maxPrice=&pageSize=&loadProductsList=true&storeId=1302&catalogId=15501&langId=-11&homePageURL=https%3A%2F%2Fwww.dreamland.be%2Fe%2Fnl%2Fdl%0A&commandContextCurrency=EUR&urlPrefixForHTTPS=https%3A%2F%2Fwww.dreamland.be%0A&urlPrefixForHTTP=https%3A%2F%2Fwww.dreamland.be%0A&wcc_integration_origin=&enableSKUListView=&widgetPrefix=12_4099276460824417158&pgl_widgetId=4099276460824417158&objectId=%20_12_-1011_4099276460824417158%0A&requesttype=ajax' + headers = {} + headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36' + headers['content-type'] = 'application/x-www-form-urlencoded' + payload = f'contentBeginIndex=0&productBeginIndex={index}&beginIndex={index}&orderBy=8&facetId=&pageView=grid&resultType=both&orderByContent=&searchTerm=&facet=&facetLimit=&minPrice=&maxPrice=&pageSize=&loadProductsList=true&storeId=13102&catalogId=15501&langId=-11&homePageURL=https%3A%2F%2Fwww.dreamland.be%2Fe%2Fnl%2Fdl%0A&commandContextCurrency=EUR&urlPrefixForHTTPS=https%3A%2F%2Fwww.dreamland.be%0A&urlPrefixForHTTP=https%3A%2F%2Fwww.dreamland.be%0A&wcc_integration_origin=&enableSKUListView=&widgetPrefix=12_4099276460824417158&pgl_widgetId=4099276460824417158&objectId=_12_-1011_4099276460824417158%0A&requesttype=ajax' + + soup = dreamland_get_soup(url, payload=payload, headers=headers) + + return soup + + +maxthreads = 2 +sema = threading.Semaphore(value=maxthreads) +threads = list() + + +if __name__ == "__main__": + config = common.get_config() + + producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']]) + common.insert_update_store(producer, {'store': config['dreamland']['name'], 'url': config['dreamland']['url'], 'image_url': config['dreamland']['logo']}) + common.clear_failed_product(config['dreamland']['name']) + + all_categories = get_categories(config['dreamland']['url']) + + process_categories = all_categories.copy() + url = next_url(process_categories) + products = {} + i = 0 + while url is not None: + i = i + 1 + print(f'{i}/{len(all_categories)} - {len(process_categories)}') + soup = common.get_soup_page(url) + if soup is None: + url = next_url(process_categories) + continue + + product_listview = get_productListView(soup) + extra_cat = soup.find_all("li", class_="singleFacet") + + if len(extra_cat) > 0: + for cat in extra_cat: + anchor = cat.find("a") + if anchor['href'] not in all_categories.values(): + anchor_title = anchor.find("span", class_="facetName").text + #print(f'added {anchor_title}- {anchor["href"]}') + process_categories[anchor_title] = anchor["href"] + all_categories[anchor_title] = anchor['href'] + index = 0 + while product_listview is not None: + view_soup = get_dreamland_productListingView(f'{config["dreamland"]["url"]}/{product_listview}', index) + #view_soup = common.get_soup_page(f'{config["dreamland"]["url"]}/{product_listview}') + view_products = view_soup.find_all("div", class_="product_info") + for product in view_products: + index = index + 1 + name = product.find("div", class_="product_name") + anchor = product.find("a") + if anchor['href'] not in products.values(): + products[name] = anchor["href"] + + thread = threading.Thread(target=get_product_details,args=(anchor["href"], 
config, producer)) + threads.append(thread) + thread.start() + next_arrow = view_soup.find("a", class_="right_arrow") + if next_arrow is None: + product_listview = None + + url = next_url(process_categories) + + for t in threads: + t.join() + + common.update_store_prices(producer, config['dreamland']) \ No newline at end of file diff --git a/parser_fun.py b/parser_fun.py new file mode 100644 index 0000000..ae75518 --- /dev/null +++ b/parser_fun.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 + +import requests +import logging +import xml.etree.ElementTree as ET +import json +import unidecode +import sqlite3 +import common +import threading +import traceback + +from bs4 import BeautifulSoup +from datetime import datetime +from kafka import KafkaProducer + + +def get_categories(cat_url): + page = requests.get(cat_url) + content = page.content[1:-1].decode('utf-8') + content = content.replace("\\r\\n", "") + content = content.replace('\\/', '/') + content = content.replace('\\"', '"') + content = content.replace(' & ', '&') + root = ET.fromstring(content) + + categories = {} + for elm in list(root.iter()): + if elm.tag == "a": + if (elm.attrib['href'] == '#') or (elm.text is None): + continue + if (elm.text.upper().find("CADEAU") >=0) or (elm.text.upper().find("VOLLEDIG") >=0): + continue + if elm.attrib['href'] not in categories.values(): + categories[elm.text] = elm.attrib['href'].replace(config["fun"]["url"], "") + return categories + + +def next_url(url_list): + if len(url_list) == 0: + return None + + key = next(iter(url_list)) + url = url_list[key] + del url_list[key] + return url + + +def get_product_details(prod_url, config, kafka_producer): + soup = common.get_soup_page_no_limit(prod_url) + if soup is None: + return + + try: + prod_view = soup.find("div", class_="product-view") + prod_essential = prod_view.find("div", class_="product-essential") + + image = prod_view.find("img", {'itemprop': 'image'})['src'] + + price = prod_essential.find("meta", {'itemprop': 'price'})['content'] + + special_price = prod_essential.find("p", class_="special-price") + promo = False + promo_end = None + if special_price is not None: + promo = True + promo_end = prod_essential.find("meta", {'itemprop': "priceValidUntil"}) + if promo_end is not None: + promo_end = promo_end['content'] + + title = prod_view.find("h1", {'itemprop': 'name'}).text + sku = prod_view.find("meta", {'itemprop': 'sku'})['content'] + brand = prod_view.find("meta", {'itemprop': 'brand'})['content'] + description = prod_view.find("meta", {'itemprop': 'description'})['content'] + + specs = prod_view.find("div", class_="tab-content tab-block-additional") + spec_li = specs.find_all("li") + info = '' + for spec in spec_li: + label = spec.find("span", class_="label").text + content = spec.find("span", class_="data").text + if label.upper().find("SKU") == -1 and label.upper().find("EAN") == -1: + info = f'{info}{label}: {content} / ' + info = info[:-3] + + ean_code = '' + ean_list = prod_view.find_all("li") + for elm in ean_list: + if elm is not None: + if elm.text.upper().find("EAN") >= 0: + ean_code = elm.find("span", class_="data").text + + product_details = {'title': title, 'url': prod_url, 'sku_code': sku, 'brand': brand, 'description': description, 'image_url': image, 'ean_code': ean_code, 'info': info} + price_details = {'price': price, "promo": promo, "promo_end": promo_end} + + common.add_product(kafka_producer, config['fun']['name'], product_details, price_details) + except Exception as err: + 
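        # any scraping/parsing failure is written out under failed/Fun/<url-derived-dir>/
        # (error.txt plus the raw page.html) so the offending product page can be inspected later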
common.dump_failed_product(config['fun']['name'], prod_url, soup, err, traceback.format_exc()) + + +maxthreads = 10 +sema = threading.Semaphore(value=maxthreads) +threads = list() + +if __name__ == "__main__": + config = common.get_config() + + producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']]) + common.insert_update_store(producer, {'store': config['fun']['name'], 'url': config['fun']['url'], 'image_url': config['fun']['logo']}) + common.clear_failed_product(config['fun']['name']) + + all_categories = get_categories(config['fun']['categories_url']) + + process_categories = all_categories.copy() + url = next_url(process_categories) + products = {} + i = 0 + while url is not None: + i = i + 1 + print(f'{i}/{len(all_categories)} - {len(process_categories)}') + cat_url = config["fun"]["url"] + url + + soup = common.get_soup_page_no_limit(cat_url) + subcat = soup.find_all("li", class_="item last") + + if len(subcat) == 0: + big_urls = soup.find_all("big") + for b_url in big_urls: + b_href = b_url.find("a") + if b_href: + try: + b_url = b_href['href'].replace(config["fun"]["url"], "") + if b_url not in all_categories.values(): + process_categories[b_href.text] = b_url + all_categories[b_href.text] = b_url + except Exception as err: + print(url) + print("+++++++++++++") + print(cat_url) + print("+++++++++++++") + print(b_href) + print("+++++++++++++") + print(err) + print("=============") + url = next_url(process_categories) + else: + for sc in subcat: + product = sc.find("h2", class_="product-name") + p_info = product.find("a") + if p_info['href'] not in products.values(): + products[p_info['title']] = p_info['href'] + + thread = threading.Thread(target=get_product_details,args=(p_info['href'], config, producer)) + threads.append(thread) + thread.start() + + next_page = soup.find("a", class_="next i-next") + if next_page is None: + url = next_url(process_categories) + else: + url = next_page['href'].replace(config["fun"]["url"], "") + + for t in threads: + t.join() + + common.update_store_prices(producer, config['fun']) \ No newline at end of file diff --git a/parser_hubo.py b/parser_hubo.py new file mode 100644 index 0000000..b3469cf --- /dev/null +++ b/parser_hubo.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 + +import requests +import logging +import xml.etree.ElementTree as ET +import json +import unidecode +import sqlite3 +import common +import threading +import re +import traceback +import sys + +from bs4 import BeautifulSoup +from datetime import datetime +from kafka import KafkaProducer + + +def get_product_details(prod_url, config, kafka_producer, json_product): + try: + soup = common.get_soup_page_no_limit(prod_url) + + description = soup.find('div', class_='pdp-description__content') + ean_row = soup.find_all('div', class_='row border-bottom') + + product = {} + price = {} + product['title'] = json_product['title'] + product['url'] = prod_url + if 'image' in json_product: + product['image_url'] = json_product['image'] + if description.find("p") is not None: + product['info'] = description.find("p").text + product['sku_code'] = json_product['sku'] + for row in ean_row: + if row.text.find('EAN- / barcode') > -1: + ean_pattern = "([0-9]{8,13})" + ean_match = re.search(ean_pattern, row.text) + product['ean_code'] = ean_match.group(1) + if 'brand' in json_product: + product['brand'] = json_product['brand'] + + price['price'] = json_product['price'] + if 'discount' in json_product: + price['promo'] = 1 + else: + price['promo'] = 0 + + 
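        # hand the product and its current price to the Kafka producer; db_consumer.py
        # turns this into products/price rows keyed on (store, sku_code)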
        common.add_product(kafka_producer, config['hubo']['name'], product, price)
+    except Exception as err:
+        common.dump_failed_product(config['hubo']['name'], prod_url, soup, err, traceback.format_exc())
+
+
+def next_url(url_list):
+    if len(url_list) == 0:
+        return None
+
+    key = next(iter(url_list))
+    url = url_list[key]
+    del url_list[key]
+    return url
+
+
+maxthreads = 5
+sema = threading.Semaphore(value=maxthreads)
+threads = list()
+
+if __name__ == "__main__" :
+    config = common.get_config()
+
+    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
+    common.insert_update_store(producer, {'store': config['hubo']['name'], 'url': config['hubo']['url'], 'image_url': config['hubo']['logo']})
+    common.clear_failed_product(config['hubo']['name'])
+
+    get_doc = requests.get(config['hubo']['products_url'])
+    data_json = json.loads(unidecode.unidecode(get_doc.content.decode('utf-8', 'ignore')))
+
+    offset = 1
+
+    while len(data_json['docs']) > 0:
+        print(f'{offset}/{data_json["doc_count"]}')
+        for product in data_json['docs']:
+            thread = threading.Thread(target=get_product_details,args=(f'{config["hubo"]["url"]}{product["url"]}', config, producer, product))
+            threads.append(thread)
+            thread.start()
+
+        offset = offset + int(data_json['limit'])
+        get_doc = requests.get(config['hubo']['products_url_offset'].replace('__OFFSET__', str(offset)))
+        data_json = json.loads(unidecode.unidecode(get_doc.content.decode('utf-8', 'ignore')))
+
+    for t in threads:
+        t.join()
+
+    common.update_store_prices(producer, config['hubo'])
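Note on the database schema: the commit adds cheap_shopper.db only as a binary blob, so the tables that db_consumer.py writes to are not visible in the diff. The sketch below is inferred from the INSERT/UPDATE/SELECT statements in db_consumer.py and from the fields the parsers emit; the real database may use different types, defaults or extra columns, and the UNIQUE constraints are an assumption based on the IntegrityError fallbacks. It only shows the minimum layout those statements need.

import sqlite3

# Inferred minimum schema for cheap_shopper.db (assumption, not the committed schema).
# The UNIQUE constraints mirror the IntegrityError handling in db_consumer.py:
# a failed INSERT falls back to an UPDATE keyed on store.store or products(store, sku_code).
SCHEMA = '''
CREATE TABLE IF NOT EXISTS store (
    id          INTEGER PRIMARY KEY AUTOINCREMENT,
    store       TEXT UNIQUE,           -- store name, e.g. Hubo
    url         TEXT,
    image_url   TEXT,
    last_update TEXT                   -- dd/mm/YYYY string, as written by the consumer
);
CREATE TABLE IF NOT EXISTS products (
    id          INTEGER PRIMARY KEY AUTOINCREMENT,
    store       INTEGER REFERENCES store(id),
    sku_code    TEXT,
    title       TEXT,
    url         TEXT,
    image_url   TEXT,
    info        TEXT,
    description TEXT,
    ean_code    TEXT,
    brand       TEXT,
    UNIQUE (store, sku_code)
);
CREATE TABLE IF NOT EXISTS price (
    id          INTEGER PRIMARY KEY AUTOINCREMENT,
    product_id  INTEGER REFERENCES products(id),
    last_update TEXT,
    price       TEXT,
    promo       INTEGER,
    promo_end   TEXT,
    active      INTEGER DEFAULT 1      -- deactivate_old_price() sets this to 0
);
'''

if __name__ == '__main__':
    # creates the tables in a fresh database; a no-op against an existing one
    with sqlite3.connect('cheap_shopper.db') as conn:
        conn.executescript(SCHEMA)

Run once before starting db_consumer.py if you are not using the committed database file.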