@@ -0,0 +1,54 @@
config:
  loglevel: logging.DEBUG
  log_sql: False
  sqlitedb: cheap_shopper.db
  active_stores: action, fun, brico
  kafka_boostrap_servers: apollo.home.lan
action:
  name: Action
  url: https://www.action.com
  logo: https://upload.wikimedia.org/wikipedia/commons/4/45/Action_Nederland_Logo_2020.svg
  categories_url: https://www.action.com/api/navigation/categories?language=nl-BE
  products_url: https://www.action.com/api/subcategories/products/?routeSegment=__url__&offset=__offset__&limit=__limit__
  products_cookie: {'epslanguage': 'nl-BE'}
brico:
  name: Brico
  url: https://www.brico.be
  logo: https://vdm.bricocdn.be/logos/brico.svg?ie
  cookie: {'language': 'nl'}
brouwland:
  name: Brouwland
  url: https://www.brouwland.com
  logo: https://brouwlandprod-yappa.netdna-ssl.com/build/images/logo-brouwland2.2fc01423.png
  catalogue_url: https://www.brouwland.com/nl/onze-producten
dreamland:
  name: Dreamland
  url: https://www.dreamland.be/e/nl/dl
  logo: https://seeklogo.com/images/D/dreamland-be-logo-7F2C0508F2-seeklogo.com.png
delhaize:
  name: Delhaize
  url: https://delhaize.be/shop
  logo:
fun:
  name: Fun
  url: https://www.fun.be
  logo: https://e7.pngegg.com/pngimages/935/634/png-clipart-fun-logo-fun-toyshop-logo-icons-logos-emojis-toy-shop-logos.png
  categories_url: https://www.fun.be/seall/menu/index
gamma:
  name: Gamma
  url: https://www.gamma.be
  logo: https://nl.wikipedia.org/wiki/Gamma_(winkel)#/media/Bestand:Gamma_logo_2010.png
  categories_url: https://www.gamma.be/nl/resources/menu/categories
hubo:
  name: Hubo
  url: https://www.hubo.be
  logo: https://www.hubo.be/content/dam/hubo/site-afbeeldingen/hubo-algemeen/logo.svg
  categories_url: https://www.hubo.be/nl/a.html
  products_url: https://sd.searchnode.net/v1/query/docs?query_key=CzVuTtVooUCa8uwwNA6tA9X6lMXVLq01&sort=-score&filters.categoriesMeta=assortiment&limit=60
  products_url_offset: https://sd.searchnode.net/v1/query/docs?query_key=CzVuTtVooUCa8uwwNA6tA9X6lMXVLq01&sort=-score&filters.categoriesMeta=assortiment&offset=__OFFSET__
intratuin_nl:
  name: Intratuin NL
  url: https://www.intratuin.nl
intratuin_be:
  name: Intratuin BE
  url: https://www.intratuin.be
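
Note on how this file is consumed: each scraper loads it through common.get_config() (defined below) and substitutes the __url__, __offset__ and __limit__ placeholders at request time. A minimal sketch, assuming the file sits in the working directory as common.get_config() expects; the category route segment in the example is hypothetical:

import yaml

with open('cheap_shopper.yaml', 'r') as ymlfile:
    cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

# Fill the placeholders of the Action products endpoint with concrete values.
url = (cfg['action']['products_url']
       .replace('__url__', '/verf-decoratie')   # hypothetical category route segment
       .replace('__offset__', '0')
       .replace('__limit__', '50'))
print(url)
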
@@ -0,0 +1,152 @@
import yaml
import json
import os
import shutil
import traceback
import requests
import re
from bs4 import BeautifulSoup
from ratelimit import limits, sleep_and_retry
from time import sleep
from random import randint
from datetime import datetime


def get_config():
    """Load the shared YAML configuration."""
    with open('cheap_shopper.yaml', 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
    return cfg


def add_product(kafka_producer, store, product, price):
    """Publish a product/price pair for a store on the shopper_db topic."""
    db_object = {}
    db_object['type'] = 'product'
    db_object['store'] = store
    db_object['product'] = product
    db_object['price'] = price
    db_object_json = json.dumps(db_object)
    db_object_bytes = bytearray(db_object_json, 'utf-8')
    send_kafka_message(kafka_producer, db_object_bytes)


def insert_update_store(kafka_producer, store):
    """Publish a store insert/update message."""
    db_object = {}
    db_object['type'] = 'store'
    db_object['store'] = store
    db_object_json = json.dumps(db_object)
    db_object_bytes = bytearray(db_object_json, 'utf-8')
    send_kafka_message(kafka_producer, db_object_bytes)


def send_kafka_message(kafka_producer, message):
    kafka_producer.send('shopper_db', message)


def update_store_prices(kafka_producer, config):
    """Refresh the store record and signal that prices from older runs can be deactivated."""
    insert_update_store(kafka_producer, {'store': config['name'], 'url': config['url'], 'image_url': config['logo'], 'last_update': datetime.now().strftime('%d/%m/%Y')})
    db_object = {}
    db_object['type'] = 'store_update'
    db_object['store'] = config['name']
    db_object_json = json.dumps(db_object)
    db_object_bytes = bytearray(db_object_json, 'utf-8')
    send_kafka_message(kafka_producer, db_object_bytes)
    print(f'updating prices {db_object_json}')


def clear_failed_product(store):
    """Remove the failed-product dumps left over from a previous run for this store."""
    if not os.path.exists('failed'):
        os.mkdir('failed')
    if not os.path.exists(f'failed/{store}'):
        os.mkdir(f'failed/{store}')
        return
    for directory in os.listdir(f'failed/{store}/'):
        shutil.rmtree(f'failed/{store}/{directory}')


def dump_failed_product(store, prod_url, page, err, trace):
    """Write the error, traceback and raw page of a product that failed to parse."""
    if not os.path.exists('failed'):
        os.mkdir('failed')
    if not os.path.exists(f'failed/{store}'):
        os.mkdir(f'failed/{store}')
    dirname = prod_url.replace('https://', '')
    dirname = dirname.replace('.', '-')
    dirname = dirname.replace('/', '_')
    os.mkdir(f'failed/{store}/{dirname}')
    with open(f'failed/{store}/{dirname}/error.txt', 'w') as err_file:
        err_file.write(f'{prod_url}\n')
        err_file.write('===========================================\n')
        err_file.write(f'{str(err)}\n')
        err_file.write('===========================================\n')
        err_file.write(str(trace))
    with open(f'failed/{store}/{dirname}/page.html', 'w', encoding='utf-8') as page_file:
        page_file.write(str(page))


def get_proxies():
    """Scrape free-proxy-list.net and return an {ip: port} mapping."""
    page = requests.get("https://free-proxy-list.net/")
    soup = BeautifulSoup(page.content, "html.parser")
    raw_div = soup.find("div", {"id": "raw"})
    raw_textarea = raw_div.find("textarea")
    ip_list = re.findall(r"([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+)", raw_textarea.text)
    ip_dict = {}
    for ip in ip_list:
        ip_addr = ip.split(":")[0]
        ip_port = ip.split(":")[1]
        ip_dict[ip_addr] = ip_port
    return ip_dict


@sleep_and_retry
@limits(calls=2, period=1)
def get_soup_page(url, cookie=None):
    """Fetch a page (rate limited to 2 calls per second) and return its BeautifulSoup tree."""
    sleep(randint(1, 2))
    soup = None
    try:
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36', 'content-type': 'text/html;charset=UTF-8'}
        #page = requests.get(url, proxies={"http": next(proxy_pool), "https": next(proxy_pool)})
        page = requests.get(url, cookies=cookie, headers=headers)
        if page.status_code != 200 and page.status_code != 301:
            return None
        soup = BeautifulSoup(page.content, "html.parser")
    except ConnectionRefusedError:
        print(traceback.format_exc())
        sleep(randint(2, 3))
        soup = get_soup_page(url)
    except Exception as err:
        print(traceback.format_exc())
        print(err)
    return soup


def get_soup_page_no_limit(url, cookie=None, payload=None, headers=None):
    """Fetch a page without rate limiting and return its BeautifulSoup tree."""
    sleep(randint(1, 2))
    soup = None
    try:
        if not headers:
            headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
                       'content-type': 'text/html;charset=UTF-8'}
        if 'user-agent' not in headers:
            headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51'
        #page = requests.get(url, proxies={"http": next(proxy_pool), "https": next(proxy_pool)})
        page = requests.get(url, cookies=cookie, data=payload, headers=headers)
        if page.status_code != 200 and page.status_code != 301:
            return None
        soup = BeautifulSoup(page.content, "html.parser")
    except Exception as err:
        print(traceback.format_exc())
        print(err)
    return soup
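
The commented-out proxies= lines in both fetch helpers refer to a proxy_pool that is never constructed in this file. A minimal sketch of how get_proxies() could feed such a pool; the name proxy_pool is taken from the comment, everything else is an assumption:

import itertools

import common

# Hypothetical rotating pool built from the scraped {ip: port} mapping.
proxy_pool = itertools.cycle(f'{ip}:{port}' for ip, port in common.get_proxies().items())

proxy = next(proxy_pool)
# e.g. requests.get(url, proxies={'http': proxy, 'https': proxy})
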
@@ -0,0 +1,187 @@
#!/usr/bin/env python3
import sqlite3
import common
import json
import time
import threading
from datetime import datetime
from kafka import KafkaConsumer


def committer(connection):
    """Commit the SQLite connection every 10 seconds from a background thread."""
    while True:
        connection.commit()
        time.sleep(10)


def insert_update_store(connection, stores, value_json):
    """Insert a store row, or update it when the store already exists."""
    cursor = connection.cursor()
    columns = '('
    values_sql = '('
    values = []
    update_sql = ''
    update_values = []
    for dict_key in value_json['store']:
        columns = f'{columns}{dict_key}, '
        values.append(value_json['store'][dict_key])
        values_sql = f'{values_sql}?, '
        if dict_key != 'store':
            update_sql = f'{update_sql}{dict_key}=?, '
            update_values.append(value_json['store'][dict_key])
    columns = columns[:-2] + ')'
    values_sql = values_sql[:-2] + ')'
    update_sql = update_sql[:-2]
    update_values.append(value_json["store"]["store"])
    try:
        sql_statement = f'INSERT INTO store {columns} VALUES {values_sql}'
        cursor.execute(sql_statement, values)
        stores[value_json["store"]["store"]] = cursor.lastrowid
    except sqlite3.IntegrityError:
        # The store already exists: fall back to an update.
        try:
            sql_statement = f'UPDATE store SET {update_sql} WHERE store=?'
            cursor.execute(sql_statement, tuple(update_values))
        except Exception as err:
            print(err)
    except Exception as err:
        print(err)


def insert_update_product(connection, stores, value_json):
    """Insert a product row, or update it when (store, sku_code) already exists."""
    cursor = connection.cursor()
    columns = '(store, '
    values_sql = '(?, '
    values = [stores[value_json["store"]]]
    update_sql = ''
    update_values = []
    for dict_key in value_json['product']:
        columns = f'{columns}{dict_key}, '
        values.append(value_json['product'][dict_key])
        values_sql = f'{values_sql}?, '
        if dict_key != 'sku_code':
            update_sql = f'{update_sql}{dict_key}=?, '
            update_values.append(value_json['product'][dict_key])
    columns = columns[:-2] + ')'
    values_sql = values_sql[:-2] + ')'
    update_sql = update_sql[:-2]
    update_values.append(stores[value_json["store"]])
    update_values.append(value_json["product"]["sku_code"])
    product_id = None
    try:
        sql_statement = f'INSERT INTO products {columns} VALUES {values_sql}'
        cursor.execute(sql_statement, values)
        product_id = cursor.lastrowid
    except sqlite3.IntegrityError:
        # The product already exists: update it and look up its id.
        try:
            sql_statement = f'UPDATE products SET {update_sql} WHERE store=? and sku_code=?'
            cursor.execute(sql_statement, tuple(update_values))
            sql_statement = 'SELECT id FROM products WHERE store=? and sku_code=?'
            cursor.execute(sql_statement, (stores[value_json["store"]], value_json["product"]["sku_code"]))
            product_id = cursor.fetchone()[0]
        except Exception as err:
            print(err)
    except Exception as err:
        print(err)
    insert_update_price(connection, stores, value_json, product_id)


def insert_update_price(connection, stores, value_json, product_id):
    """Insert a new active price, or only refresh last_update when the price is unchanged."""
    cursor = connection.cursor()
    columns = '(product_id, last_update, '
    values_sql = '(?, ?, '
    values = [product_id, datetime.now().strftime('%d/%m/%Y')]
    for dict_key in value_json['price']:
        columns = f'{columns}{dict_key}, '
        values.append(value_json['price'][dict_key])
        values_sql = f'{values_sql}?, '
    columns = columns[:-2] + ')'
    values_sql = values_sql[:-2] + ')'
    price_exists = False
    try:
        sql_statement = 'SELECT id FROM price WHERE product_id=? and price=? and active=1'
        cursor.execute(sql_statement, (product_id, value_json["price"]["price"]))
        if cursor.fetchone():
            price_exists = True
    except Exception as err:
        print(err)
    try:
        if not price_exists:
            sql_statement = f'INSERT INTO price {columns} VALUES {values_sql}'
            cursor.execute(sql_statement, values)
        else:
            sql_statement = 'UPDATE price SET last_update=? WHERE product_id=? and price=? and active=1'
            cursor.execute(sql_statement, (datetime.now().strftime("%d/%m/%Y"), product_id, value_json["price"]["price"]))
    except sqlite3.IntegrityError as err:
        print(err)
    except Exception as err:
        print(err)


def deactivate_old_price(connection, stores, value_json):
    """Mark a store's prices as inactive when they were not refreshed by the latest run."""
    cursor = connection.cursor()
    try:
        sql_statement = 'UPDATE price SET active=0 WHERE product_id IN (SELECT id FROM products WHERE store=?) AND last_update<?'
        cursor.execute(sql_statement, (stores[value_json["store"]], datetime.now().strftime("%d/%m/%Y")))
    except Exception as err:
        print(err)


def insert(connection, stores, value_json):
    """Dispatch an incoming Kafka message to the matching insert/update handler."""
    if value_json['type'] == 'store':
        print('insert_update_store')
        insert_update_store(connection, stores, value_json)
    if value_json['type'] == 'store_update':
        print('deactivate_old_price')
        deactivate_old_price(connection, stores, value_json)
    if value_json['type'] == 'product':
        insert_update_product(connection, stores, value_json)


def get_stores(connection):
    """Return a {store name: row id} mapping for all known stores."""
    stores = {}
    cursor = connection.cursor()
    sql_statement = 'SELECT store, id FROM store'
    cursor.execute(sql_statement)
    result = cursor.fetchall()
    for res in result:
        stores[res[0]] = res[1]
    return stores


if __name__ == '__main__':
    config = common.get_config()
    con = sqlite3.connect(config['config']['sqlitedb'], check_same_thread=False)
    if config['config']['log_sql']:
        con.set_trace_callback(print)
    stores = get_stores(con)
    commit_thread = threading.Thread(target=committer, args=(con,), daemon=True)
    commit_thread.start()
    consumer = KafkaConsumer('shopper_db', bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    for msg in consumer:
        value_json = json.loads(msg.value.decode("utf-8"))
        insert(con, stores, value_json)
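
The consumer assumes the store, products and price tables already exist. The column names can be read off the INSERT statements above; the sketch below is an assumed schema in which the UNIQUE constraints are what makes the INSERT-then-UPDATE fallback on sqlite3.IntegrityError work. Types and defaults are guesses.

import sqlite3

# Assumed schema; only column names and the uniqueness relied on by the
# IntegrityError handling are taken from the consumer code above.
SCHEMA = """
CREATE TABLE IF NOT EXISTS store (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    store TEXT UNIQUE,
    url TEXT,
    image_url TEXT,
    last_update TEXT
);
CREATE TABLE IF NOT EXISTS products (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    store INTEGER REFERENCES store(id),
    sku_code TEXT,
    title TEXT,
    url TEXT,
    image_url TEXT,
    info TEXT,
    description TEXT,
    brand TEXT,
    ean_code TEXT,
    UNIQUE (store, sku_code)
);
CREATE TABLE IF NOT EXISTS price (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    product_id INTEGER REFERENCES products(id),
    price TEXT,
    promo INTEGER,
    promo_start TEXT,
    promo_end TEXT,
    last_update TEXT,
    active INTEGER DEFAULT 1
);
"""

con = sqlite3.connect('cheap_shopper.db')
con.executescript(SCHEMA)
con.commit()
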
@@ -0,0 +1,89 @@
#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import traceback
import sys
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer


def get_categories(url):
    page = requests.get(url)
    root = ET.fromstring(page.content)
    categories = {}
    for cat_elm in root.findall("./Data/CategoryNavigationViewModel"):
        cat_label = cat_elm.find("Label")
        cat_url = cat_elm.find("Url")
        categories[cat_label.text] = cat_url.text
    return categories


def get_products(config, cat_url, kafka_producer):
    url = config['action']['products_url']
    url = url.replace("__url__", cat_url)
    url = url.replace("__offset__", "0")
    total_url = url.replace("__limit__", "1")
    products_cookie = config['action']['products_cookie']
    get_total = requests.get(total_url, cookies=products_cookie)
    data_json = json.loads(unidecode.unidecode(get_total.content.decode('utf-8', 'ignore')))
    totalCount = data_json['totalCount']
    all_url = url.replace("__limit__", str(totalCount))
    products = requests.get(all_url, cookies=products_cookie)
    products_json = json.loads(unidecode.unidecode(products.content.decode('utf-8', 'ignore')))
    for product_json in products_json['items']:
        try:
            if product_json['type'] != 'product':
                continue
            product = {}
            price = {}
            product['title'] = product_json['title']
            product['url'] = config['action']['url'] + product_json['url']
            product['image_url'] = config['action']['url'] + product_json['imageUrl']
            product['info'] = product_json['subTitle']
            product['sku_code'] = product_json['code']
            for spec in product_json['specifications']:
                if spec['id'] == 'attEANCodeVariant':
                    product['ean_code'] = spec['value']
                if spec['id'] == 'attLongDescription':
                    product['description'] = spec['value']
            product['brand'] = product_json['brandName']
            price['price'] = product_json['price']
            price['promo'] = 0 if product_json['isDeal'] == False else 1
            price['promo_start'] = product_json['dealStartDate']
            price['promo_end'] = product_json['dealEndDate']
            common.add_product(kafka_producer, config['action']['name'], product, price)
        except Exception as err:
            common.dump_failed_product(config['action']['name'], cat_url, product_json, err, traceback.format_exc())


if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['action']['name'], 'url': config['action']['url'], 'image_url': config['action']['logo']})
    common.clear_failed_product(config['action']['name'])
    categories = get_categories(config['action']['categories_url'])
    for cat in categories:
        soup = common.get_soup_page_no_limit(f"{config['action']['url']}{categories[cat]}")
        subcat = soup.find_all("a", class_="subcategory-cta-list__cta")
        for sc in subcat:
            get_products(config, sc['href'], producer)
    common.update_store_prices(producer, config['action'])
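
For reference, the field lookups in get_products() imply that each entry of products_json['items'] is shaped roughly like the dictionary below. This is reconstructed from the code, not taken from Action's API documentation, and every value is made up:

# Assumed shape of one entry in products_json['items'], inferred from the lookups above.
example_item = {
    'type': 'product',
    'title': 'Voorbeeldproduct',
    'url': '/nl-be/p/2578348/voorbeeldproduct',   # hypothetical route
    'imageUrl': '/dw/image/v2/voorbeeld.jpg',     # hypothetical path
    'subTitle': 'korte omschrijving',
    'code': '2578348',
    'brandName': 'MerkNaam',
    'price': 2.49,
    'isDeal': False,
    'dealStartDate': None,
    'dealEndDate': None,
    'specifications': [
        {'id': 'attEANCodeVariant', 'value': '8718964045678'},
        {'id': 'attLongDescription', 'value': 'langere omschrijving'},
    ],
}
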
@@ -0,0 +1,158 @@
#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import re
import traceback
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer


def get_categories(url, cookie):
    """Collect category links from the mega dropdown menu on the Brico homepage."""
    soup = common.get_soup_page_no_limit(url, cookie)
    categories = {}
    dropdown = soup.find('div', class_="mxd-mega-dropdown")
    anchors = dropdown.find_all('a')
    for anchor in anchors:
        if anchor['href'].find('bricosolar') >= 0:
            print(f'remove {anchor["href"]}')
        elif anchor['href'] not in categories.values():
            if anchor['href'].find('https://www.brico.be') == -1:
                anchor['href'] = f'https://www.brico.be{anchor["href"]}'
            categories[anchor.text] = anchor['href']
    return categories


def get_product_details(prod_url, config, kafka_producer, failed):
    """Scrape one Brico product page and publish its product and price data."""
    try:
        soup = common.get_soup_page_no_limit(prod_url, config['brico']['cookie'])
        prod_detail = soup.find('div', class_='mxd-container mxd-product-detail-layout')
        if prod_detail is None:
            return
        prod_specs = soup.find('div', {'id': 'specs'})
        title = prod_detail.find('h1', class_='mxd-product-detail-layout-heading mxd-h1')
        product = {}
        price = {}
        product['title'] = title.find("span", {'itemprop': 'name'}).text
        product['url'] = prod_url
        product['image_url'] = prod_detail.find("img", class_="mxd-fixed-ratio-image")['src']
        product['info'] = prod_specs.find_all("p")[0].text
        product['sku_code'] = prod_url.split("/")[-1]
        ean_pattern = r"([0-9]{8,13})"
        if len(prod_specs.find_all("p")) > 1:
            ean_match = re.search(ean_pattern, prod_specs.find_all("p")[1].text)
            product['ean_code'] = ean_match.group(1)
        # Brand and EAN also appear in the JSON embedded in the page's <script> tags.
        pattern_brand = r'brands":\[\{"code":"[\w\s]+","name":"([\w\s]+)'
        pattern_ean = r'"ean":"([0-9]{8,13})"'
        scripts = soup.find_all("script")
        for script in scripts:
            if script.string is not None:
                if script.string.find('"brands"') >= 0:
                    match = re.search(pattern_brand, script.string)
                    product['brand'] = match.group(1)
                if script.string.find('"ean"') >= 0 and 'ean_code' not in product:
                    match = re.search(pattern_ean, script.string)
                    product['ean_code'] = match.group(1)
        if soup.find("ins") is None:
            return
        price['price'] = soup.find("ins").find("meta")['content']
        price['promo'] = 0 if soup.find("del") is None else 1
        common.add_product(kafka_producer, config['brico']['name'], product, price)
    except Exception as err:
        common.dump_failed_product(config['brico']['name'], prod_url, soup, err, traceback.format_exc())


def next_url(url_list):
    """Pop and return the first URL of the pending category dict, or None when empty."""
    if len(url_list) == 0:
        return None
    key = next(iter(url_list))
    url = url_list[key]
    del url_list[key]
    return url


maxthreads = 10
sema = threading.Semaphore(value=maxthreads)
threads = list()

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['brico']['name'], 'url': config['brico']['url'], 'image_url': config['brico']['logo']})
    common.clear_failed_product(config['brico']['name'])
    all_categories = get_categories(config['brico']['url'], config['brico']['cookie'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    failed_prod = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        soup = common.get_soup_page_no_limit(url, config['brico']['cookie'])
        page_products = soup.find_all("div", class_="mxd-block-card mxd-block-card-responsive")
        extra_cat = soup.find_all("a", class_="mxd-product-list-link")
        if len(extra_cat) > 0:
            for cat in extra_cat:
                if cat['href'].find('bricosolar') >= 0:
                    print(f'main remove {cat["href"]}')
                elif cat['href'] not in all_categories.values():
                    cat_name = cat.find("span").text
                    if cat['href'].find("/") > 0:
                        cat['href'] = f'/{cat["href"]}'
                    process_categories[cat_name] = f'{config["brico"]["url"]}{cat["href"]}'
                    all_categories[cat_name] = cat['href']
        for product in page_products:
            product = product.find("a", class_="mxd-block-card-link")
            if product['href'].find("/nl") == -1:
                product['href'] = f'/nl{product["href"]}'
            if product['href'].find("/") > 0:
                product['href'] = f'/{product["href"]}'
            if product['href'] not in products.values():
                prod_name = product.find("span", {"itemprop": "name"}).text
                products[prod_name] = product["href"]
                thread = threading.Thread(target=get_product_details, args=(f'{config["brico"]["url"]}{product["href"]}', config, producer, failed_prod))
                threads.append(thread)
                thread.start()
        next_page = soup.find("a", {"rel": "next"})
        if next_page is None:
            url = next_url(process_categories)
        else:
            base = soup.find("link", {"rel": "alternate", "hreflang": "nl-be"})
            url = f'{base["href"]}{next_page["href"]}'
            print(f'next page: {url}')
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['brico'])
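
The two regular expressions in get_product_details() pull the brand name and EAN code out of JSON that the Brico product page embeds in a <script> tag. A small self-contained illustration with made-up script content:

import re

# Made-up excerpt of the embedded JSON a Brico product page carries in a <script> tag.
script_text = '... "brands":[{"code":"LEV01","name":"Levis"}] ... "ean":"5412345678908" ...'

brand = re.search(r'brands":\[\{"code":"[\w\s]+","name":"([\w\s]+)', script_text).group(1)
ean = re.search(r'"ean":"([0-9]{8,13})"', script_text).group(1)
print(brand, ean)   # Levis 5412345678908
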
@@ -0,0 +1,153 @@
#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import re
import traceback
import sys
import itertools
#logging.basicConfig(level=logging.DEBUG)
from time import sleep
from random import randint
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer
from ratelimit import limits, sleep_and_retry


def get_categories(url):
    """Collect the category tiles from the Brouwland catalogue page."""
    soup = common.get_soup_page(url)
    categories = {}
    articles = soup.find_all('article', class_="item bg-gray")
    for article in articles:
        anchor = article.find('a')
        anchor_title = anchor['title'].split(' | ')[0]
        categories[anchor_title] = anchor['href']
    return categories


def get_product_details(prod_url, config, kafka_producer, anchor_title):
    """Scrape one Brouwland product page and publish its product and price data."""
    try:
        soup = common.get_soup_page(prod_url)
        if soup is None:
            common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None)
            return
        prod_detail = soup.find('article')
        if prod_detail is None:
            common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None)
            return
        title = prod_detail.find('h1', {'itemprop': 'name'})
        image = prod_detail.find('a', class_='image-click').find('img')
        description = prod_detail.find('div', {'itemprop': 'description'})
        ean_code = prod_detail.find('td', {'itemprop': 'eancode'})
        sku = prod_detail.find('span', {'itemprop': 'sku'})
        brand = prod_detail.find('span', {'itemprop': 'brand'})
        price_detail = prod_detail.find('span', {'itemprop': 'price'})
        product = {}
        price = {}
        if title is not None:
            product['title'] = title.text.strip()
        product['url'] = prod_url
        product['image_url'] = image['src']
        if description is not None:
            product['info'] = description.text.strip()
        if sku is not None:
            product['sku_code'] = sku.text.strip()
        if ean_code is not None:
            product['ean_code'] = ean_code.text.strip()
        if brand is not None:
            product['brand'] = brand.text.strip()
        price['price'] = price_detail.text.split()[1]
        price['promo'] = 0 if soup.find("del") is None else 1
        common.add_product(kafka_producer, config['brouwland']['name'], product, price)
    except Exception as err:
        print(traceback.format_exc())
        common.dump_failed_product(config['brouwland']['name'], prod_url, soup, err, traceback.format_exc())


def next_url(url_list):
    """Pop and return the first URL of the pending category dict, or None when empty."""
    if len(url_list) == 0:
        return None
    key = next(iter(url_list))
    url = url_list[key]
    del url_list[key]
    return url


maxthreads = 5
sema = threading.Semaphore(value=maxthreads)
threads = list()

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['brouwland']['name'], 'url': config['brouwland']['url'], 'image_url': config['brouwland']['logo']})
    common.clear_failed_product(config['brouwland']['name'])
    all_categories = get_categories(config['brouwland']['catalogue_url'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        url = f'{config["brouwland"]["url"]}{url}'
        soup = common.get_soup_page(url)
        if soup is None:
            url = next_url(process_categories)
            continue
        page_products = soup.find_all("article", class_="product")
        extra_cat = soup.find_all("article", class_="item bg-gray")
        if len(extra_cat) > 0:
            for cat in extra_cat:
                anchor = cat.find('a')
                if anchor['href'] not in all_categories.values():
                    anchor_title = anchor['title'].split(' | ')[0]
                    process_categories[anchor_title] = f'{anchor["href"]}'
                    all_categories[anchor_title] = anchor['href']
        for product in page_products:
            anchor = product.find("a")
            if anchor['href'] not in products.values():
                anchor_title = anchor['title'].split(' | ')[0]
                if anchor_title.upper().find("CADEAU") > -1:
                    continue
                products[anchor_title] = anchor["href"]
                thread = threading.Thread(target=get_product_details, args=(f'{config["brouwland"]["url"]}{anchor["href"]}', config, producer, anchor_title))
                threads.append(thread)
                thread.start()
        next_page = soup.find("a", class_="next")
        if next_page is None:
            url = next_url(process_categories)
        else:
            url = next_page["href"]
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['brouwland'])
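
Brico, Brouwland, Dreamland and Fun all share the same crawl loop: a dict of pending category URLs that next_url() pops from, plus a second dict that remembers everything already seen so newly discovered categories are queued only once. The pattern in isolation, with generic names and a faked discovery step:

def next_url(url_list):
    """Pop the first pending URL, or return None when the frontier is empty."""
    if not url_list:
        return None
    key = next(iter(url_list))
    return url_list.pop(key)

all_categories = {'Start': '/nl/start'}        # every category ever seen (for dedupe)
process_categories = all_categories.copy()     # frontier of categories still to visit

url = next_url(process_categories)
while url is not None:
    # In the real scrapers the page is fetched here and extra category links
    # are pulled out of it; this sketch fakes a single discovery instead.
    discovered = {'Verf': '/nl/verf'} if url == '/nl/start' else {}
    for name, href in discovered.items():
        if href not in all_categories.values():
            process_categories[name] = href
            all_categories[name] = href
    print('visited', url)
    url = next_url(process_categories)
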
@@ -0,0 +1,180 @@
#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import re
import traceback
import sys
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer
from ratelimit import limits, sleep_and_retry


def get_categories(url):
    """Collect the main navigation categories from the Dreamland homepage."""
    soup = dreamland_get_soup(url)
    categories = {}
    main_categories = soup.find_all("p", class_="subNav__categoryTitle")
    for cat in main_categories:
        anchor = cat.find("a")
        categories[anchor.text] = anchor['href']
    return categories


@sleep_and_retry
@limits(calls=1, period=2)
def dreamland_get_soup(url, payload=None, cookie=None, headers=None):
    """Rate-limited wrapper (1 call per 2 seconds) around the shared page fetcher."""
    return common.get_soup_page_no_limit(url, payload=payload, cookie=cookie, headers=headers)


def get_product_details(prod_url, config, kafka_producer):
    """Scrape one Dreamland product page and publish its product and price data."""
    try:
        soup = dreamland_get_soup(prod_url)
        product = {}
        price = {}
        sku = soup.find("span", class_="sku").text.split(":")[1]
        title = soup.find("h1", class_="main_header").text
        image = soup.find("img", {'id': 'productMainImage'})['src']
        desc = soup.find("div", class_="product_text").text
        attrs = soup.find("div", {'id': 'Attributes_table'})
        items = attrs.find_all('li', attrs={'class': re.compile('.*attribute.*')})
        ean_list = []
        for item in items:
            if item.text.find('Merk') > -1:
                product['brand'] = item.find("span").text.strip('">').strip()
            if item.text.find('EAN') > -1:
                ean_codes = item.find_all("span")
                for code in ean_codes:
                    ean = code.text.strip('">').strip()
                    ean_list.append(ean)
        product['sku_code'] = sku
        product['url'] = prod_url
        product['title'] = title
        product['image_url'] = image
        product['info'] = desc
        product['ean_code'] = ", ".join(ean_list)
        if soup.find("div", class_="price red mini") is None:
            price['price'] = soup.find("div", class_="product_price").text.strip("€\xa0")
            price['promo'] = 0
        else:
            price['price'] = soup.find("div", class_="price red mini").text.strip("€\xa0")
            price['promo'] = 1
        common.add_product(kafka_producer, config['dreamland']['name'], product, price)
    except Exception as err:
        common.dump_failed_product(config['dreamland']['name'], prod_url, soup, err, traceback.format_exc())


def get_productListView(soup):
    """Extract the ProductListingView AJAX endpoint from the page's inline scripts."""
    scripts = soup.find_all("script")
    list_view_pattern = "\'(ProductListingView.*)',{"
    listview = None
    if scripts is None:
        return listview
    for script in scripts:
        if script.string is not None:
            if script.string.find("ProductListingView") > 0:
                listview_match = re.search(list_view_pattern, script.string)
                listview = listview_match.group(1)
    return listview


def next_url(url_list):
    """Pop and return the first URL of the pending category dict, or None when empty."""
    if len(url_list) == 0:
        return None
    key = next(iter(url_list))
    url = url_list[key]
    del url_list[key]
    return url


def get_dreamland_productListingView(url, index=0):
    """POST the paging payload to the ProductListingView endpoint and return the soup."""
    headers = {}
    headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    headers['content-type'] = 'application/x-www-form-urlencoded'
    payload = f'contentBeginIndex=0&productBeginIndex={index}&beginIndex={index}&orderBy=8&facetId=&pageView=grid&resultType=both&orderByContent=&searchTerm=&facet=&facetLimit=&minPrice=&maxPrice=&pageSize=&loadProductsList=true&storeId=13102&catalogId=15501&langId=-11&homePageURL=https%3A%2F%2Fwww.dreamland.be%2Fe%2Fnl%2Fdl%0A&commandContextCurrency=EUR&urlPrefixForHTTPS=https%3A%2F%2Fwww.dreamland.be%0A&urlPrefixForHTTP=https%3A%2F%2Fwww.dreamland.be%0A&wcc_integration_origin=&enableSKUListView=&widgetPrefix=12_4099276460824417158&pgl_widgetId=4099276460824417158&objectId=_12_-1011_4099276460824417158%0A&requesttype=ajax'
    soup = dreamland_get_soup(url, payload=payload, headers=headers)
    return soup


maxthreads = 2
sema = threading.Semaphore(value=maxthreads)
threads = list()

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['dreamland']['name'], 'url': config['dreamland']['url'], 'image_url': config['dreamland']['logo']})
    common.clear_failed_product(config['dreamland']['name'])
    all_categories = get_categories(config['dreamland']['url'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        soup = common.get_soup_page(url)
        if soup is None:
            url = next_url(process_categories)
            continue
        product_listview = get_productListView(soup)
        extra_cat = soup.find_all("li", class_="singleFacet")
        if len(extra_cat) > 0:
            for cat in extra_cat:
                anchor = cat.find("a")
                if anchor['href'] not in all_categories.values():
                    anchor_title = anchor.find("span", class_="facetName").text
                    process_categories[anchor_title] = anchor["href"]
                    all_categories[anchor_title] = anchor['href']
        index = 0
        while product_listview is not None:
            view_soup = get_dreamland_productListingView(f'{config["dreamland"]["url"]}/{product_listview}', index)
            view_products = view_soup.find_all("div", class_="product_info")
            for product in view_products:
                index = index + 1
                name = product.find("div", class_="product_name")
                anchor = product.find("a")
                if anchor['href'] not in products.values():
                    products[name.text] = anchor["href"]
                    thread = threading.Thread(target=get_product_details, args=(anchor["href"], config, producer))
                    threads.append(thread)
                    thread.start()
            next_arrow = view_soup.find("a", class_="right_arrow")
            if next_arrow is None:
                product_listview = None
        url = next_url(process_categories)
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['dreamland'])
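
The @sleep_and_retry / @limits(calls=1, period=2) pair on dreamland_get_soup() comes from the ratelimit package: limits raises once the call budget for the period is used up, and sleep_and_retry turns that into a pause before retrying. A standalone illustration:

from ratelimit import limits, sleep_and_retry
import time

@sleep_and_retry
@limits(calls=1, period=2)      # at most one call every two seconds
def fetch(url):
    return url                  # stand-in for the real HTTP request

start = time.time()
for _ in range(3):
    fetch('https://example.com')
print(f'{time.time() - start:.1f}s')   # roughly four seconds for three calls
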
@@ -0,0 +1,166 @@
#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import traceback
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer


def get_categories(cat_url):
    page = requests.get(cat_url)
    content = page.content[1:-1].decode('utf-8')
    content = content.replace("\\r\\n", "")
    content = content.replace('\\/', '/')
    content = content.replace('\\"', '"')
    content = content.replace(' & ', '&')
    root = ET.fromstring(content)
    categories = {}
    for elm in list(root.iter()):
        if elm.tag == "a":
            if (elm.attrib['href'] == '#') or (elm.text is None):
                continue
            if (elm.text.upper().find("CADEAU") >= 0) or (elm.text.upper().find("VOLLEDIG") >= 0):
                continue
            if elm.attrib['href'] not in categories.values():
                categories[elm.text] = elm.attrib['href'].replace(config["fun"]["url"], "")
    return categories


def next_url(url_list):
    if len(url_list) == 0:
        return None
    key = next(iter(url_list))
    url = url_list[key]
    del url_list[key]
    return url


def get_product_details(prod_url, config, kafka_producer):
    soup = common.get_soup_page_no_limit(prod_url)
    if soup is None:
        return
    try:
        prod_view = soup.find("div", class_="product-view")
        prod_essential = prod_view.find("div", class_="product-essential")
        image = prod_view.find("img", {'itemprop': 'image'})['src']
        price = prod_essential.find("meta", {'itemprop': 'price'})['content']
        special_price = prod_essential.find("p", class_="special-price")
        promo = False
        promo_end = None
        if special_price is not None:
            promo = True
            promo_end = prod_essential.find("meta", {'itemprop': "priceValidUntil"})
            if promo_end is not None:
                promo_end = promo_end['content']
        title = prod_view.find("h1", {'itemprop': 'name'}).text
        sku = prod_view.find("meta", {'itemprop': 'sku'})['content']
        brand = prod_view.find("meta", {'itemprop': 'brand'})['content']
        description = prod_view.find("meta", {'itemprop': 'description'})['content']
        specs = prod_view.find("div", class_="tab-content tab-block-additional")
        spec_li = specs.find_all("li")
        info = ''
        for spec in spec_li:
            label = spec.find("span", class_="label").text
            content = spec.find("span", class_="data").text
            if label.upper().find("SKU") == -1 and label.upper().find("EAN") == -1:
                info = f'{info}{label}: {content} / '
        info = info[:-3]
        ean_code = ''
        ean_list = prod_view.find_all("li")
        for elm in ean_list:
            if elm is not None:
                if elm.text.upper().find("EAN") >= 0:
                    ean_code = elm.find("span", class_="data").text
        product_details = {'title': title, 'url': prod_url, 'sku_code': sku, 'brand': brand, 'description': description, 'image_url': image, 'ean_code': ean_code, 'info': info}
        price_details = {'price': price, "promo": promo, "promo_end": promo_end}
        common.add_product(kafka_producer, config['fun']['name'], product_details, price_details)
    except Exception as err:
        common.dump_failed_product(config['fun']['name'], prod_url, soup, err, traceback.format_exc())


maxthreads = 10
sema = threading.Semaphore(value=maxthreads)
threads = list()

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['fun']['name'], 'url': config['fun']['url'], 'image_url': config['fun']['logo']})
    common.clear_failed_product(config['fun']['name'])
    all_categories = get_categories(config['fun']['categories_url'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        cat_url = config["fun"]["url"] + url
        soup = common.get_soup_page_no_limit(cat_url)
        subcat = soup.find_all("li", class_="item last")
        if len(subcat) == 0:
            big_urls = soup.find_all("big")
            for b_url in big_urls:
                b_href = b_url.find("a")
                if b_href:
                    try:
                        b_url = b_href['href'].replace(config["fun"]["url"], "")
                        if b_url not in all_categories.values():
                            process_categories[b_href.text] = b_url
                            all_categories[b_href.text] = b_url
                    except Exception as err:
                        print(url)
                        print("+++++++++++++")
                        print(cat_url)
                        print("+++++++++++++")
                        print(b_href)
                        print("+++++++++++++")
                        print(err)
                        print("=============")
            url = next_url(process_categories)
        else:
            for sc in subcat:
                product = sc.find("h2", class_="product-name")
                p_info = product.find("a")
                if p_info['href'] not in products.values():
                    products[p_info['title']] = p_info['href']
                    thread = threading.Thread(target=get_product_details, args=(p_info['href'], config, producer))
                    threads.append(thread)
                    thread.start()
            next_page = soup.find("a", class_="next i-next")
            if next_page is None:
                url = next_url(process_categories)
            else:
                url = next_page['href'].replace(config["fun"]["url"], "")
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['fun'])
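
The Fun menu endpoint returns its HTML wrapped as a JSON-escaped string, which is why get_categories() strips the outer quotes and unescapes the body before parsing it with ElementTree. A small illustration of that clean-up on a made-up response body:

# Made-up raw body, shaped like the quoted/escaped payload the menu endpoint returns.
raw = b'"<ul>\\r\\n<li><a href=\\"https:\\/\\/www.fun.be\\/speelgoed\\">Speelgoed</a></li></ul>"'

content = raw[1:-1].decode('utf-8')      # drop the surrounding quotes
content = content.replace("\\r\\n", "")
content = content.replace('\\/', '/')
content = content.replace('\\"', '"')
print(content)   # <ul><li><a href="https://www.fun.be/speelgoed">Speelgoed</a></li></ul>
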
@@ -0,0 +1,95 @@
#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import re
import traceback
import sys
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer


def get_product_details(prod_url, config, kafka_producer, json_product):
    try:
        soup = common.get_soup_page_no_limit(prod_url)
        description = soup.find('div', class_='pdp-description__content')
        ean_row = soup.find_all('div', class_='row border-bottom')
        product = {}
        price = {}
        product['title'] = json_product['title']
        product['url'] = prod_url
        if 'image' in json_product:
            product['image_url'] = json_product['image']
        if description.find("p") is not None:
            product['info'] = description.find("p").text
        product['sku_code'] = json_product['sku']
        for row in ean_row:
            if row.text.find('EAN- / barcode') > -1:
                ean_pattern = "([0-9]{8,13})"
                ean_match = re.search(ean_pattern, row.text)
                product['ean_code'] = ean_match.group(1)
        if 'brand' in json_product:
            product['brand'] = json_product['brand']
        price['price'] = json_product['price']
        if 'discount' in json_product:
            price['promo'] = 1
        else:
            price['promo'] = 0
        common.add_product(kafka_producer, config['hubo']['name'], product, price)
    except Exception as err:
        common.dump_failed_product(config['hubo']['name'], prod_url, soup, err, traceback.format_exc())


def next_url(url_list):
    if len(url_list) == 0:
        return None
    key = next(iter(url_list))
    url = url_list[key]
    del url_list[key]
    return url


maxthreads = 5
sema = threading.Semaphore(value=maxthreads)
threads = list()

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['hubo']['name'], 'url': config['hubo']['url'], 'image_url': config['hubo']['logo']})
    common.clear_failed_product(config['hubo']['name'])
    get_doc = requests.get(config['hubo']['products_url'])
    data_json = json.loads(unidecode.unidecode(get_doc.content.decode('utf-8', 'ignore')))
    offset = 1
    while len(data_json['docs']) > 0:
        print(f'{offset}/{data_json["doc_count"]}')
        for product in data_json['docs']:
            thread = threading.Thread(target=get_product_details, args=(f'{config["hubo"]["url"]}{product["url"]}', config, producer, product))
            threads.append(thread)
            thread.start()
        offset = offset + int(data_json['limit'])
        get_doc = requests.get(config['hubo']['products_url_offset'].replace('__OFFSET__', str(offset)))
        data_json = json.loads(unidecode.unidecode(get_doc.content.decode('utf-8', 'ignore')))
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['hubo'])
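
Once the Kafka consumer has filled cheap_shopper.db, current prices can be read back with a plain join. A hedged example; the column names match the INSERT statements in the consumer, but the schema itself is assumed:

import sqlite3

con = sqlite3.connect('cheap_shopper.db')
rows = con.execute(
    """
    SELECT s.store, p.title, pr.price, pr.promo
    FROM price AS pr
    JOIN products AS p ON p.id = pr.product_id
    JOIN store AS s ON s.id = p.store
    WHERE pr.active = 1
    ORDER BY s.store, p.title
    LIMIT 10
    """
)
for store, title, price, promo in rows:
    print(store, title, price, 'PROMO' if promo else '')
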