@@ -0,0 +1,54 @@
config:
  loglevel: logging.DEBUG
  log_sql: False
  sqlitedb: cheap_shopper.db
  active_stores: action, fun, brico
  kafka_boostrap_servers: apollo.home.lan
action:
  name: Action
  url: https://www.action.com
  logo: https://upload.wikimedia.org/wikipedia/commons/4/45/Action_Nederland_Logo_2020.svg
  categories_url: https://www.action.com/api/navigation/categories?language=nl-BE
  products_url: https://www.action.com/api/subcategories/products/?routeSegment=__url__&offset=__offset__&limit=__limit__
  products_cookie: {'epslanguage': 'nl-BE'}
brico:
  name: Brico
  url: https://www.brico.be
  logo: https://vdm.bricocdn.be/logos/brico.svg?ie
  cookie: {'language': 'nl'}
brouwland:
  name: Brouwland
  url: https://www.brouwland.com
  logo: https://brouwlandprod-yappa.netdna-ssl.com/build/images/logo-brouwland2.2fc01423.png
  catalogue_url: https://www.brouwland.com/nl/onze-producten
dreamland:
  name: Dreamland
  url: https://www.dreamland.be/e/nl/dl
  logo: https://seeklogo.com/images/D/dreamland-be-logo-7F2C0508F2-seeklogo.com.png
delhaize:
  name: Delhaize
  url: https://delhaize.be/shop
  logo:
fun:
  name: Fun
  url: https://www.fun.be
  logo: https://e7.pngegg.com/pngimages/935/634/png-clipart-fun-logo-fun-toyshop-logo-icons-logos-emojis-toy-shop-logos.png
  categories_url: https://www.fun.be/seall/menu/index
gamma:
  name: Gamma
  url: https://www.gamma.be
  logo: https://nl.wikipedia.org/wiki/Gamma_(winkel)#/media/Bestand:Gamma_logo_2010.png
  categories_url: https://www.gamma.be/nl/resources/menu/categories
hubo:
  name: Hubo
  url: https://www.hubo.be
  logo: https://www.hubo.be/content/dam/hubo/site-afbeeldingen/hubo-algemeen/logo.svg
  categories_url: https://www.hubo.be/nl/a.html
  products_url: https://sd.searchnode.net/v1/query/docs?query_key=CzVuTtVooUCa8uwwNA6tA9X6lMXVLq01&sort=-score&filters.categoriesMeta=assortiment&limit=60
  products_url_offset: https://sd.searchnode.net/v1/query/docs?query_key=CzVuTtVooUCa8uwwNA6tA9X6lMXVLq01&sort=-score&filters.categoriesMeta=assortiment&offset=__OFFSET__
intratuin_nl:
  name: Intratuin NL
  url: https://www.intratuin.nl
intratuin_be:
  name: Intratuin BE
  url: https://www.intratuin.be
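The products_url entries above use __url__, __offset__ and __limit__ (and __OFFSET__ for Hubo) as placeholders that each scraper substitutes before calling the endpoint. A minimal sketch of that substitution, mirroring what the Action scraper's get_products below does (the category segment and the numbers are illustrative only):

# Sketch only: fill in the Action products_url template from the config.
# 'speelgoed-entertainment' is a hypothetical routeSegment, not a real category.
import yaml

with open('cheap_shopper.yaml', 'r') as ymlfile:
    cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

url = cfg['action']['products_url']
url = url.replace('__url__', 'speelgoed-entertainment')
url = url.replace('__offset__', '0')
url = url.replace('__limit__', '50')
print(url)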
@@ -0,0 +1,152 @@
import yaml
import json
import os
import shutil
import traceback
import requests
import re
from bs4 import BeautifulSoup
from ratelimit import limits, sleep_and_retry
from time import sleep
from random import randint
from datetime import datetime


def get_config():
    with open('cheap_shopper.yaml', 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
    return cfg


def add_product(kafka_producer, store, product, price):
    # Publish a single product + price as a 'product' message.
    db_object = {}
    db_object['type'] = 'product'
    db_object['store'] = store
    db_object['product'] = product
    db_object['price'] = price
    db_object_json = json.dumps(db_object)
    db_object_bytes = bytearray(db_object_json, 'utf-8')
    send_kafka_message(kafka_producer, db_object_bytes)


def insert_update_store(kafka_producer, store):
    # Publish store metadata as a 'store' message.
    db_object = {}
    db_object['type'] = 'store'
    db_object['store'] = store
    db_object_json = json.dumps(db_object)
    db_object_bytes = bytearray(db_object_json, 'utf-8')
    send_kafka_message(kafka_producer, db_object_bytes)


def send_kafka_message(kafka_producer, message):
    kafka_producer.send('shopper_db', message)


def update_store_prices(kafka_producer, config):
    # Refresh the store record and tell the consumer to deactivate stale prices.
    insert_update_store(kafka_producer, {'store': config['name'], 'url': config['url'], 'image_url': config['logo'], 'last_update': datetime.now().strftime('%d/%m/%Y')})
    db_object = {}
    db_object['type'] = 'store_update'
    db_object['store'] = config['name']
    db_object_json = json.dumps(db_object)
    db_object_bytes = bytearray(db_object_json, 'utf-8')
    send_kafka_message(kafka_producer, db_object_bytes)
    print(f'updating prices {db_object_json}')


def clear_failed_product(store):
    # Recreate an empty failed/<store>/ directory tree for this run.
    if not os.path.exists('failed'):
        os.mkdir('failed')
    if not os.path.exists(f'failed/{store}'):
        os.mkdir(f'failed/{store}')
        return
    for entry in os.listdir(f'failed/{store}/'):
        shutil.rmtree(f'failed/{store}/{entry}')


def dump_failed_product(store, prod_url, page, err, trace):
    # Save the failing URL, the exception and the fetched page for later inspection.
    if not os.path.exists('failed'):
        os.mkdir('failed')
    if not os.path.exists(f'failed/{store}'):
        os.mkdir(f'failed/{store}')
    dirname = prod_url.replace('https://', '')
    dirname = dirname.replace('.', '-')
    dirname = dirname.replace('/', '_')
    os.mkdir(f'failed/{store}/{dirname}')
    with open(f'failed/{store}/{dirname}/error.txt', 'w') as err_file:
        err_file.write(f'{prod_url}\n')
        err_file.write('===========================================\n')
        err_file.write(f'{str(err)}\n')
        err_file.write('===========================================\n')
        err_file.write(str(trace))
    with open(f'failed/{store}/{dirname}/page.html', 'w', encoding='utf-8') as page_file:
        page_file.write(str(page))


def get_proxies():
    # Scrape free-proxy-list.net and return a dict of {ip: port}.
    page = requests.get("https://free-proxy-list.net/")
    soup = BeautifulSoup(page.content, "html.parser")
    raw_div = soup.find("div", {"id": "raw"})
    raw_textarea = raw_div.find("textarea")
    ip_list = re.findall(r"([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+)", raw_textarea.text)
    ip_dict = {}
    for ip in ip_list:
        ip_addr = ip.split(":")[0]
        ip_port = ip.split(":")[1]
        ip_dict[ip_addr] = ip_port
    return ip_dict


@sleep_and_retry
@limits(calls=2, period=1)
def get_soup_page(url, cookie=None):
    #print(f'get page for soup: {url}')
    sleep(randint(1, 2))
    soup = None
    try:
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36', 'content-type': 'text/html;charset=UTF-8'}
        #page = requests.get(url, proxies={"http": next(proxy_pool), "https": next(proxy_pool)})
        page = requests.get(url, cookies=cookie, headers=headers)
        if page.status_code != 200 and page.status_code != 301:
            return None
        soup = BeautifulSoup(page.content, "html.parser")
    except ConnectionRefusedError:
        print(traceback.format_exc())
        sleep(randint(2, 3))
        soup = get_soup_page(url, cookie)
    except Exception as err:
        print(traceback.format_exc())
        print(err)
    return soup


def get_soup_page_no_limit(url, cookie=None, payload=None, headers=None):
    #print(f'get page for soup: {url}')
    sleep(randint(1, 2))
    soup = None
    try:
        if headers is None or len(headers) == 0:
            headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
                       'content-type': 'text/html;charset=UTF-8'}
        if 'user-agent' not in headers:
            headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51'
        #page = requests.get(url, proxies={"http": next(proxy_pool), "https": next(proxy_pool)})
        page = requests.get(url, cookies=cookie, data=payload, headers=headers)
        #print(page.content)
        if page.status_code != 200 and page.status_code != 301:
            return None
        soup = BeautifulSoup(page.content, "html.parser")
    except Exception as err:
        print(traceback.format_exc())
        print(err)
    return soup
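For reference, the helpers above publish three kinds of JSON messages on the shopper_db topic, and the database consumer below dispatches on their type field. Roughly, they look like this (field names taken from the functions above; the values are illustrative only):

# Illustrative message shapes; values are made up, keys mirror the functions above.
store_msg = {'type': 'store',
             'store': {'store': 'Action', 'url': 'https://www.action.com', 'image_url': 'https://...'}}

product_msg = {'type': 'product',
               'store': 'Action',
               'product': {'title': '...', 'url': '...', 'sku_code': '...', 'ean_code': '...', 'brand': '...'},
               'price': {'price': '4.95', 'promo': 0}}

store_update_msg = {'type': 'store_update', 'store': 'Action'}  # triggers deactivation of stale prices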
@@ -0,0 +1,187 @@
#!/usr/bin/env python3
import sqlite3
import common
import json
import time
import threading
from datetime import datetime
from kafka import KafkaConsumer


def committer(connection):
    # Commit pending writes every 10 seconds from a background thread.
    while True:
        connection.commit()
        time.sleep(10)


def insert_update_store(connection, stores, value_json):
    cursor = connection.cursor()
    columns = '('
    values_sql = '('
    values = []
    update_sql = ''
    update_values = []
    for dict_key in value_json['store']:
        columns = f'{columns}{dict_key}, '
        values.append(value_json['store'][dict_key])
        values_sql = f'{values_sql}?, '
        if dict_key != 'store':
            update_sql = f'{update_sql}{dict_key}=?, '
            update_values.append(value_json['store'][dict_key])
    columns = columns[:-2] + ')'
    values_sql = values_sql[:-2] + ')'
    update_sql = update_sql[:-2]
    update_values.append(value_json["store"]["store"])
    try:
        sql_statement = f'INSERT INTO store {columns} VALUES {values_sql}'
        cursor.execute(sql_statement, values)
        stores[value_json["store"]["store"]] = cursor.lastrowid
    except sqlite3.IntegrityError:
        # Store already exists: update it instead.
        try:
            sql_statement = f'UPDATE store SET {update_sql} WHERE store=?'
            cursor.execute(sql_statement, tuple(update_values))
        except Exception as err:
            print(err)
    except Exception as err:
        print(err)


def insert_update_product(connection, stores, value_json):
    cursor = connection.cursor()
    columns = '(store, '
    values_sql = '(?, '
    values = [stores[value_json["store"]]]
    update_sql = ''
    update_values = []
    for dict_key in value_json['product']:
        columns = f'{columns}{dict_key}, '
        values.append(value_json['product'][dict_key])
        values_sql = f'{values_sql}?, '
        if dict_key != 'sku_code':
            update_sql = f'{update_sql}{dict_key}=?, '
            update_values.append(value_json['product'][dict_key])
    columns = columns[:-2] + ')'
    values_sql = values_sql[:-2] + ')'
    update_sql = update_sql[:-2]
    update_values.append(stores[value_json["store"]])
    update_values.append(value_json["product"]["sku_code"])
    product_id = None
    try:
        sql_statement = f'INSERT INTO products {columns} VALUES {values_sql}'
        cursor.execute(sql_statement, values)
        product_id = cursor.lastrowid
        #print(f'inserted {product_id}')
    except sqlite3.IntegrityError:
        # Product already exists for this store: update it and look up its id.
        try:
            sql_statement = f'UPDATE products SET {update_sql} WHERE store=? and sku_code=?'
            cursor.execute(sql_statement, tuple(update_values))
            sql_statement = 'SELECT id FROM products WHERE store=? and sku_code=?'
            cursor.execute(sql_statement, (stores[value_json["store"]], value_json["product"]["sku_code"]))
            product_id = cursor.fetchone()[0]
            #print(f'updated {product_id}')
        except Exception as err:
            print(err)
    except Exception as err:
        print(err)
    insert_update_price(connection, stores, value_json, product_id)


def insert_update_price(connection, stores, value_json, product_id):
    cursor = connection.cursor()
    columns = '(product_id, last_update, '
    values_sql = '(?, ?, '
    values = [product_id, datetime.now().strftime('%d/%m/%Y')]
    for dict_key in value_json['price']:
        columns = f'{columns}{dict_key}, '
        values.append(value_json['price'][dict_key])
        values_sql = f'{values_sql}?, '
    columns = columns[:-2] + ')'
    values_sql = values_sql[:-2] + ')'
    price_exists = False
    try:
        sql_statement = 'SELECT id FROM price WHERE product_id=? and price=? and active=1'
        cursor.execute(sql_statement, (product_id, value_json["price"]["price"]))
        if cursor.fetchone():
            price_exists = True
    except Exception as err:
        print(err)
    try:
        if not price_exists:
            sql_statement = f'INSERT INTO price {columns} VALUES {values_sql}'
            cursor.execute(sql_statement, values)
        else:
            sql_statement = 'UPDATE price SET last_update=? WHERE product_id=? and price=? and active=1'
            cursor.execute(sql_statement, (datetime.now().strftime("%d/%m/%Y"), product_id, value_json["price"]["price"]))
    except sqlite3.IntegrityError as err:
        print(err)
    except Exception as err:
        print(err)


def deactivate_old_price(connection, stores, value_json):
    # Mark prices that were not touched today as no longer active for this store.
    cursor = connection.cursor()
    try:
        sql_statement = 'UPDATE price SET active=0 WHERE product_id IN (SELECT id FROM products WHERE store=?) AND last_update<?'
        cursor.execute(sql_statement, (stores[value_json["store"]], datetime.now().strftime("%d/%m/%Y")))
        #print(sql_statement)
    except Exception as err:
        print(err)


def insert(connection, stores, value_json):
    if value_json['type'] == 'store':
        print('insert_update_store')
        insert_update_store(connection, stores, value_json)
    if value_json['type'] == 'store_update':
        print('deactivate_old_price')
        deactivate_old_price(connection, stores, value_json)
    if value_json['type'] == 'product':
        #print('insert_update_product')
        insert_update_product(connection, stores, value_json)


def get_stores(connection):
    stores = {}
    cursor = connection.cursor()
    sql_statement = 'SELECT store, id FROM store'
    cursor.execute(sql_statement)
    result = cursor.fetchall()
    for res in result:
        stores[res[0]] = res[1]
    return stores


if __name__ == '__main__':
    config = common.get_config()
    con = sqlite3.connect(config['config']['sqlitedb'], check_same_thread=False)
    if config['config']['log_sql']:
        con.set_trace_callback(print)
    stores = get_stores(con)
    commit_thread = threading.Thread(target=committer, args=(con,), daemon=True)
    commit_thread.start()
    consumer = KafkaConsumer('shopper_db', bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    for msg in consumer:
        value_json = json.loads(msg.value.decode("utf-8"))
        insert(con, stores, value_json)
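The consumer assumes the store, products and price tables already exist in the SQLite database. A possible schema sketch follows, with column names inferred from the INSERT/UPDATE statements above; the types, the DEFAULT on active, and the uniqueness constraints (which are what make the IntegrityError-then-UPDATE pattern work) are assumptions:

# Hypothetical schema derived from the statements above; adjust as needed.
import sqlite3

schema = """
CREATE TABLE IF NOT EXISTS store (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    store TEXT UNIQUE, url TEXT, image_url TEXT, last_update TEXT
);
CREATE TABLE IF NOT EXISTS products (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    store INTEGER, sku_code TEXT, title TEXT, url TEXT, image_url TEXT,
    info TEXT, description TEXT, ean_code TEXT, brand TEXT,
    UNIQUE (store, sku_code)
);
CREATE TABLE IF NOT EXISTS price (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    product_id INTEGER, price TEXT, promo INTEGER,
    promo_start TEXT, promo_end TEXT, last_update TEXT, active INTEGER DEFAULT 1
);
"""

con = sqlite3.connect('cheap_shopper.db')
con.executescript(schema)
con.close()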
@@ -0,0 +1,89 @@
#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import traceback
import sys
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer


def get_categories(url):
    # The categories endpoint returns XML; map each category label to its URL.
    page = requests.get(url)
    root = ET.fromstring(page.content)
    categories = {}
    for cat_elm in root.findall("./Data/CategoryNavigationViewModel"):
        cat_label = cat_elm.find("Label")
        cat_url = cat_elm.find("Url")
        categories[cat_label.text] = cat_url.text
    return categories


def get_products(config, cat_url, kafka_producer):
    # First fetch with limit=1 to learn the total count, then fetch everything in one request.
    url = config['action']['products_url']
    url = url.replace("__url__", cat_url)
    url = url.replace("__offset__", "0")
    total_url = url.replace("__limit__", "1")
    products_cookie = config['action']['products_cookie']
    get_total = requests.get(total_url, cookies=products_cookie)
    data_json = json.loads(unidecode.unidecode(get_total.content.decode('utf-8', 'ignore')))
    total_count = data_json['totalCount']
    all_url = url.replace("__limit__", str(total_count))
    products = requests.get(all_url, cookies=products_cookie)
    products_json = json.loads(unidecode.unidecode(products.content.decode('utf-8', 'ignore')))
    for product_json in products_json['items']:
        try:
            if product_json['type'] != 'product':
                continue
            product = {}
            price = {}
            product['title'] = product_json['title']
            product['url'] = config['action']['url'] + product_json['url']
            product['image_url'] = config['action']['url'] + product_json['imageUrl']
            product['info'] = product_json['subTitle']
            product['sku_code'] = product_json['code']
            for spec in product_json['specifications']:
                if spec['id'] == 'attEANCodeVariant':
                    product['ean_code'] = spec['value']
                if spec['id'] == 'attLongDescription':
                    product['description'] = spec['value']
            product['brand'] = product_json['brandName']
            price['price'] = product_json['price']
            price['promo'] = 1 if product_json['isDeal'] else 0
            price['promo_start'] = product_json['dealStartDate']
            price['promo_end'] = product_json['dealEndDate']
            common.add_product(kafka_producer, config['action']['name'], product, price)
        except Exception as err:
            common.dump_failed_product(config['action']['name'], cat_url, product_json, err, traceback.format_exc())


if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['action']['name'], 'url': config['action']['url'], 'image_url': config['action']['logo']})
    common.clear_failed_product(config['action']['name'])
    categories = get_categories(config['action']['categories_url'])
    for cat in categories:
        soup = common.get_soup_page_no_limit(f"{config['action']['url']}{categories[cat]}")
        subcat = soup.find_all("a", class_="subcategory-cta-list__cta")
        for sc in subcat:
            get_products(config, sc['href'], producer)
    common.update_store_prices(producer, config['action'])
@@ -0,0 +1,158 @@
#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import re
import traceback
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer


def get_categories(url, cookie):
    soup = common.get_soup_page_no_limit(url, cookie)
    categories = {}
    dropdown = soup.find('div', class_="mxd-mega-dropdown")
    anchors = dropdown.find_all('a')
    for anchor in anchors:
        if anchor['href'].find('bricosolar') >= 0:
            print(f'remove {anchor["href"]}')
        elif anchor['href'] not in categories.values():
            if anchor['href'].find('https://www.brico.be') == -1:
                anchor['href'] = f'https://www.brico.be{anchor["href"]}'
            categories[anchor.text] = anchor['href']
    return categories


def get_product_details(prod_url, config, kafka_producer, failed):
    soup = None
    with sema:  # cap the number of product pages fetched concurrently
        try:
            soup = common.get_soup_page_no_limit(prod_url, config['brico']['cookie'])
            prod_detail = soup.find('div', class_='mxd-container mxd-product-detail-layout')
            if prod_detail is None:
                return
            prod_specs = soup.find('div', {'id': 'specs'})
            title = prod_detail.find('h1', class_='mxd-product-detail-layout-heading mxd-h1')
            product = {}
            price = {}
            product['title'] = title.find("span", {'itemprop': 'name'}).text
            product['url'] = prod_url
            product['image_url'] = prod_detail.find("img", class_="mxd-fixed-ratio-image")['src']
            product['info'] = prod_specs.find_all("p")[0].text
            product['sku_code'] = prod_url.split("/")[-1]
            ean_pattern = r"([0-9]{8,13})"
            if len(prod_specs.find_all("p")) > 1:
                ean_match = re.search(ean_pattern, prod_specs.find_all("p")[1].text)
                product['ean_code'] = ean_match.group(1)
            pattern_brand = r'brands\"\:\[\{\"code\"\:\"[\w\s]+\",\"name\"\:\"([\w\s]+)'
            pattern_ean = r'\"ean\"\:\"([0-9]{8,13})\"'
            scripts = soup.find_all("script")
            for script in scripts:
                if script.string is not None:
                    if script.string.find('"brands"') >= 0:
                        match = re.search(pattern_brand, script.string)
                        product['brand'] = match.group(1)
                    if script.string.find('"ean"') >= 0 and 'ean_code' not in product:
                        match = re.search(pattern_ean, script.string)
                        product['ean_code'] = match.group(1)
            if soup.find("ins") is None:
                return
            price['price'] = soup.find("ins").find("meta")['content']
            price['promo'] = 0 if soup.find("del") is None else 1
            common.add_product(kafka_producer, config['brico']['name'], product, price)
        except Exception as err:
            common.dump_failed_product(config['brico']['name'], prod_url, soup, err, traceback.format_exc())


def next_url(url_list):
    # Pop the first entry from the dict of pending category URLs.
    if len(url_list) == 0:
        return None
    key = next(iter(url_list))
    url = url_list[key]
    del url_list[key]
    return url


maxthreads = 10
sema = threading.Semaphore(value=maxthreads)
threads = list()

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['brico']['name'], 'url': config['brico']['url'], 'image_url': config['brico']['logo']})
    common.clear_failed_product(config['brico']['name'])
    all_categories = get_categories(config['brico']['url'], config['brico']['cookie'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    failed_prod = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        soup = common.get_soup_page_no_limit(url, config['brico']['cookie'])
        page_products = soup.find_all("div", class_="mxd-block-card mxd-block-card-responsive")
        extra_cat = soup.find_all("a", class_="mxd-product-list-link")
        if len(extra_cat) > 0:
            for cat in extra_cat:
                if cat['href'].find('bricosolar') >= 0:
                    print(f'main remove {cat["href"]}')
                elif cat['href'] not in all_categories.values():
                    cat_name = cat.find("span").text
                    if cat['href'].find("/") > 0:
                        cat['href'] = f'/{cat["href"]}'
                    process_categories[cat_name] = f'{config["brico"]["url"]}{cat["href"]}'
                    #print(f'added {config["brico"]["url"]}{cat["href"]}')
                    all_categories[cat_name] = cat['href']
        for product in page_products:
            product = product.find("a", class_="mxd-block-card-link")
            if product['href'].find("/nl") == -1:
                product['href'] = f'/nl{product["href"]}'
            if product['href'].find("/") > 0:
                product['href'] = f'/{product["href"]}'
            if product['href'] not in products.values():
                prod_name = product.find("span", {"itemprop": "name"}).text
                #print(f'product url: {product["href"]}')
                products[prod_name] = product["href"]
                thread = threading.Thread(target=get_product_details, args=(f'{config["brico"]["url"]}{product["href"]}', config, producer, failed_prod))
                threads.append(thread)
                thread.start()
        next_page = soup.find("a", {"rel": "next"})
        if next_page is None:
            url = next_url(process_categories)
        else:
            base = soup.find("link", {"rel": "alternate", "hreflang": "nl-be"})
            url = f'{base["href"]}{next_page["href"]}'
            print(f'next page: {url}')
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['brico'])
@@ -0,0 +1,153 @@
#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import re
import traceback
import sys
import itertools
#logging.basicConfig(level=logging.DEBUG)
from time import sleep
from random import randint
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer
from ratelimit import limits, sleep_and_retry


def get_categories(url):
    soup = common.get_soup_page(url)
    categories = {}
    articles = soup.find_all('article', class_="item bg-gray")
    for article in articles:
        anchor = article.find('a')
        anchor_title = anchor['title'].split(' | ')[0]
        categories[anchor_title] = anchor['href']
    return categories


def get_product_details(prod_url, config, kafka_producer, anchor_title):
    soup = None
    with sema:  # cap the number of product pages fetched concurrently
        try:
            soup = common.get_soup_page(prod_url)
            if soup is None:
                common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None)
                return
            prod_detail = soup.find('article')
            if prod_detail is None:
                common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None)
                return
            title = prod_detail.find('h1', {'itemprop': 'name'})
            image = prod_detail.find('a', class_='image-click').find('img')
            description = prod_detail.find('div', {'itemprop': 'description'})
            ean_code = prod_detail.find('td', {'itemprop': 'eancode'})
            sku = prod_detail.find('span', {'itemprop': 'sku'})
            brand = prod_detail.find('span', {'itemprop': 'brand'})
            price_detail = prod_detail.find('span', {'itemprop': 'price'})
            product = {}
            price = {}
            if title is not None:
                product['title'] = title.text.strip()
            product['url'] = prod_url
            product['image_url'] = image['src']
            if description is not None:
                product['info'] = description.text.strip()
            if sku is not None:
                product['sku_code'] = sku.text.strip()
            if ean_code is not None:
                product['ean_code'] = ean_code.text.strip()
            if brand is not None:
                product['brand'] = brand.text.strip()
            price['price'] = price_detail.text.split()[1]
            price['promo'] = 0 if soup.find("del") is None else 1
            common.add_product(kafka_producer, config['brouwland']['name'], product, price)
        except Exception as err:
            print(traceback.format_exc())
            common.dump_failed_product(config['brouwland']['name'], prod_url, soup, err, traceback.format_exc())


def next_url(url_list):
    # Pop the first entry from the dict of pending category URLs.
    if len(url_list) == 0:
        return None
    key = next(iter(url_list))
    url = url_list[key]
    del url_list[key]
    return url


maxthreads = 5
sema = threading.Semaphore(value=maxthreads)
threads = list()

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['brouwland']['name'], 'url': config['brouwland']['url'], 'image_url': config['brouwland']['logo']})
    common.clear_failed_product(config['brouwland']['name'])
    all_categories = get_categories(config['brouwland']['catalogue_url'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        url = f'{config["brouwland"]["url"]}{url}'
        soup = common.get_soup_page(url)
        if soup is None:
            url = next_url(process_categories)
            continue
        page_products = soup.find_all("article", class_="product")
        extra_cat = soup.find_all("article", class_="item bg-gray")
        if len(extra_cat) > 0:
            for cat in extra_cat:
                anchor = cat.find('a')
                if anchor['href'] not in all_categories.values():
                    anchor_title = anchor['title'].split(' | ')[0]
                    process_categories[anchor_title] = f'{anchor["href"]}'
                    all_categories[anchor_title] = anchor['href']
                    #print(f'added {cat["data-href"]}')
        for product in page_products:
            anchor = product.find("a")
            if anchor['href'] not in products.values():
                anchor_title = anchor['title'].split(' | ')[0]
                if anchor_title.upper().find("CADEAU") > -1:
                    continue
                products[anchor_title] = anchor["href"]
                thread = threading.Thread(target=get_product_details, args=(f'{config["brouwland"]["url"]}{anchor["href"]}', config, producer, anchor_title))
                threads.append(thread)
                thread.start()
        next_page = soup.find("a", class_="next")
        if next_page is None:
            url = next_url(process_categories)
        else:
            url = next_page["href"]
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['brouwland'])
@@ -0,0 +1,180 @@
#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import re
import traceback
import sys
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer
from ratelimit import limits, sleep_and_retry


def get_categories(url):
    soup = dreamland_get_soup(url)
    categories = {}
    main_categories = soup.find_all("p", class_="subNav__categoryTitle")
    for cat in main_categories:
        anchor = cat.find("a")
        categories[anchor.text] = anchor['href']
    return categories


@sleep_and_retry
@limits(calls=1, period=2)
def dreamland_get_soup(url, payload=None, cookie=None, headers=None):
    # Rate-limited wrapper around the shared fetch helper (max 1 call every 2 seconds).
    return common.get_soup_page_no_limit(url, payload=payload, cookie=cookie, headers=headers or {})


def get_product_details(prod_url, config, kafka_producer):
    soup = None
    with sema:  # cap the number of product pages fetched concurrently
        try:
            soup = dreamland_get_soup(prod_url)
            product = {}
            price = {}
            sku = soup.find("span", class_="sku").text.split(":")[1]
            title = soup.find("h1", class_="main_header").text
            image = soup.find("img", {'id': 'productMainImage'})['src']
            desc = soup.find("div", class_="product_text").text
            attrs = soup.find("div", {'id': 'Attributes_table'})
            items = attrs.find_all('li', attrs={'class': re.compile('.*attribute.*')})
            ean_list = []
            for item in items:
                if item.text.find('Merk') > -1:
                    product['brand'] = item.find("span").text.strip('">').strip()
                if item.text.find('EAN') > -1:
                    ean_codes = item.find_all("span")
                    for code in ean_codes:
                        ean = code.text.strip('">').strip()
                        ean_list.append(ean)
            product['sku_code'] = sku
            product['url'] = prod_url
            product['title'] = title
            product['image_url'] = image
            product['info'] = desc
            product['ean_code'] = ", ".join(ean_list)
            if soup.find("div", class_="price red mini") is None:
                price['price'] = soup.find("div", class_="product_price").text.strip("€\xa0")
                price['promo'] = 0
            else:
                price['price'] = soup.find("div", class_="price red mini").text.strip("€\xa0")
                price['promo'] = 1
            common.add_product(kafka_producer, config['dreamland']['name'], product, price)
        except Exception as err:
            common.dump_failed_product(config['dreamland']['name'], prod_url, soup, err, traceback.format_exc())


def get_productListView(soup):
    # Find the ProductListingView AJAX endpoint embedded in one of the page's scripts.
    scripts = soup.find_all("script")
    list_view_pattern = "\'(ProductListingView.*)',{"
    listview = None
    if scripts is None:
        return listview
    for script in scripts:
        if script.string is not None:
            if script.string.find("ProductListingView") > 0:
                listview_match = re.search(list_view_pattern, script.string)
                listview = listview_match.group(1)
    return listview


def next_url(url_list):
    # Pop the first entry from the dict of pending category URLs.
    if len(url_list) == 0:
        return None
    key = next(iter(url_list))
    url = url_list[key]
    del url_list[key]
    return url


def get_dreamland_productListingView(url, index=0):
    # POST the grid-view form data to the ProductListingView endpoint, starting at the given index.
    #payload='contentBeginIndex=0&productBeginIndex=__INDEX__&beginIndex=__INDEX__&orderBy=8&facetId=&pageView=grid&resultType=both&orderByContent=&searchTerm=&facet=&facetLimit=&minPrice=&maxPrice=&pageSize=&loadProductsList=true&storeId=1302&catalogId=15501&langId=-11&homePageURL=https%3A%2F%2Fwww.dreamland.be%2Fe%2Fnl%2Fdl%0A&commandContextCurrency=EUR&urlPrefixForHTTPS=https%3A%2F%2Fwww.dreamland.be%0A&urlPrefixForHTTP=https%3A%2F%2Fwww.dreamland.be%0A&wcc_integration_origin=&enableSKUListView=&widgetPrefix=12_4099276460824417158&pgl_widgetId=4099276460824417158&objectId=%20_12_-1011_4099276460824417158%0A&requesttype=ajax'
    headers = {}
    headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    headers['content-type'] = 'application/x-www-form-urlencoded'
    payload = f'contentBeginIndex=0&productBeginIndex={index}&beginIndex={index}&orderBy=8&facetId=&pageView=grid&resultType=both&orderByContent=&searchTerm=&facet=&facetLimit=&minPrice=&maxPrice=&pageSize=&loadProductsList=true&storeId=13102&catalogId=15501&langId=-11&homePageURL=https%3A%2F%2Fwww.dreamland.be%2Fe%2Fnl%2Fdl%0A&commandContextCurrency=EUR&urlPrefixForHTTPS=https%3A%2F%2Fwww.dreamland.be%0A&urlPrefixForHTTP=https%3A%2F%2Fwww.dreamland.be%0A&wcc_integration_origin=&enableSKUListView=&widgetPrefix=12_4099276460824417158&pgl_widgetId=4099276460824417158&objectId=_12_-1011_4099276460824417158%0A&requesttype=ajax'
    return dreamland_get_soup(url, payload=payload, headers=headers)


maxthreads = 2
sema = threading.Semaphore(value=maxthreads)
threads = list()

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['dreamland']['name'], 'url': config['dreamland']['url'], 'image_url': config['dreamland']['logo']})
    common.clear_failed_product(config['dreamland']['name'])
    all_categories = get_categories(config['dreamland']['url'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        soup = common.get_soup_page(url)
        if soup is None:
            url = next_url(process_categories)
            continue
        product_listview = get_productListView(soup)
        extra_cat = soup.find_all("li", class_="singleFacet")
        if len(extra_cat) > 0:
            for cat in extra_cat:
                anchor = cat.find("a")
                if anchor['href'] not in all_categories.values():
                    anchor_title = anchor.find("span", class_="facetName").text
                    #print(f'added {anchor_title}- {anchor["href"]}')
                    process_categories[anchor_title] = anchor["href"]
                    all_categories[anchor_title] = anchor['href']
        index = 0
        while product_listview is not None:
            view_soup = get_dreamland_productListingView(f'{config["dreamland"]["url"]}/{product_listview}', index)
            #view_soup = common.get_soup_page(f'{config["dreamland"]["url"]}/{product_listview}')
            view_products = view_soup.find_all("div", class_="product_info")
            for product in view_products:
                index = index + 1
                name = product.find("div", class_="product_name")
                anchor = product.find("a")
                if anchor['href'] not in products.values():
                    products[name.text] = anchor["href"]
                    thread = threading.Thread(target=get_product_details, args=(anchor["href"], config, producer))
                    threads.append(thread)
                    thread.start()
            next_arrow = view_soup.find("a", class_="right_arrow")
            if next_arrow is None:
                product_listview = None
                url = next_url(process_categories)
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['dreamland'])
@@ -0,0 +1,166 @@
#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import traceback
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer


def get_categories(cat_url):
    # The menu endpoint returns an escaped HTML fragment; unescape it so it parses as XML.
    page = requests.get(cat_url)
    content = page.content[1:-1].decode('utf-8')
    content = content.replace("\\r\\n", "")
    content = content.replace('\\/', '/')
    content = content.replace('\\"', '"')
    content = content.replace(' & ', '&amp;')
    root = ET.fromstring(content)
    categories = {}
    for elm in list(root.iter()):
        if elm.tag == "a":
            if (elm.attrib['href'] == '#') or (elm.text is None):
                continue
            if (elm.text.upper().find("CADEAU") >= 0) or (elm.text.upper().find("VOLLEDIG") >= 0):
                continue
            if elm.attrib['href'] not in categories.values():
                categories[elm.text] = elm.attrib['href'].replace(config["fun"]["url"], "")
    return categories


def next_url(url_list):
    # Pop the first entry from the dict of pending category URLs.
    if len(url_list) == 0:
        return None
    key = next(iter(url_list))
    url = url_list[key]
    del url_list[key]
    return url


def get_product_details(prod_url, config, kafka_producer):
    with sema:  # cap the number of product pages fetched concurrently
        soup = common.get_soup_page_no_limit(prod_url)
        if soup is None:
            return
        try:
            prod_view = soup.find("div", class_="product-view")
            prod_essential = prod_view.find("div", class_="product-essential")
            image = prod_view.find("img", {'itemprop': 'image'})['src']
            price = prod_essential.find("meta", {'itemprop': 'price'})['content']
            special_price = prod_essential.find("p", class_="special-price")
            promo = 0
            promo_end = None
            if special_price is not None:
                promo = 1
                promo_end = prod_essential.find("meta", {'itemprop': "priceValidUntil"})
                if promo_end is not None:
                    promo_end = promo_end['content']
            title = prod_view.find("h1", {'itemprop': 'name'}).text
            sku = prod_view.find("meta", {'itemprop': 'sku'})['content']
            brand = prod_view.find("meta", {'itemprop': 'brand'})['content']
            description = prod_view.find("meta", {'itemprop': 'description'})['content']
            specs = prod_view.find("div", class_="tab-content tab-block-additional")
            spec_li = specs.find_all("li")
            info = ''
            for spec in spec_li:
                label = spec.find("span", class_="label").text
                content = spec.find("span", class_="data").text
                if label.upper().find("SKU") == -1 and label.upper().find("EAN") == -1:
                    info = f'{info}{label}: {content} / '
            info = info[:-3]
            ean_code = ''
            ean_list = prod_view.find_all("li")
            for elm in ean_list:
                if elm is not None:
                    if elm.text.upper().find("EAN") >= 0:
                        ean_code = elm.find("span", class_="data").text
            product_details = {'title': title, 'url': prod_url, 'sku_code': sku, 'brand': brand, 'description': description, 'image_url': image, 'ean_code': ean_code, 'info': info}
            price_details = {'price': price, "promo": promo, "promo_end": promo_end}
            common.add_product(kafka_producer, config['fun']['name'], product_details, price_details)
        except Exception as err:
            common.dump_failed_product(config['fun']['name'], prod_url, soup, err, traceback.format_exc())


maxthreads = 10
sema = threading.Semaphore(value=maxthreads)
threads = list()

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['fun']['name'], 'url': config['fun']['url'], 'image_url': config['fun']['logo']})
    common.clear_failed_product(config['fun']['name'])
    all_categories = get_categories(config['fun']['categories_url'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        cat_url = config["fun"]["url"] + url
        soup = common.get_soup_page_no_limit(cat_url)
        subcat = soup.find_all("li", class_="item last")
        if len(subcat) == 0:
            # No products on this page: harvest sub-category links instead.
            big_urls = soup.find_all("big")
            for b_url in big_urls:
                b_href = b_url.find("a")
                if b_href:
                    try:
                        b_url = b_href['href'].replace(config["fun"]["url"], "")
                        if b_url not in all_categories.values():
                            process_categories[b_href.text] = b_url
                            all_categories[b_href.text] = b_url
                    except Exception as err:
                        print(url)
                        print("+++++++++++++")
                        print(cat_url)
                        print("+++++++++++++")
                        print(b_href)
                        print("+++++++++++++")
                        print(err)
                        print("=============")
            url = next_url(process_categories)
        else:
            for sc in subcat:
                product = sc.find("h2", class_="product-name")
                p_info = product.find("a")
                if p_info['href'] not in products.values():
                    products[p_info['title']] = p_info['href']
                    thread = threading.Thread(target=get_product_details, args=(p_info['href'], config, producer))
                    threads.append(thread)
                    thread.start()
            next_page = soup.find("a", class_="next i-next")
            if next_page is None:
                url = next_url(process_categories)
            else:
                url = next_page['href'].replace(config["fun"]["url"], "")
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['fun'])
@@ -0,0 +1,95 @@
#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import re
import traceback
import sys
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer


def get_product_details(prod_url, config, kafka_producer, json_product):
    # Most fields come from the search API document; the product page only adds description and EAN.
    soup = None
    with sema:  # cap the number of product pages fetched concurrently
        try:
            soup = common.get_soup_page_no_limit(prod_url)
            description = soup.find('div', class_='pdp-description__content')
            ean_row = soup.find_all('div', class_='row border-bottom')
            product = {}
            price = {}
            product['title'] = json_product['title']
            product['url'] = prod_url
            if 'image' in json_product:
                product['image_url'] = json_product['image']
            if description.find("p") is not None:
                product['info'] = description.find("p").text
            product['sku_code'] = json_product['sku']
            for row in ean_row:
                if row.text.find('EAN- / barcode') > -1:
                    ean_pattern = r"([0-9]{8,13})"
                    ean_match = re.search(ean_pattern, row.text)
                    product['ean_code'] = ean_match.group(1)
            if 'brand' in json_product:
                product['brand'] = json_product['brand']
            price['price'] = json_product['price']
            if 'discount' in json_product:
                price['promo'] = 1
            else:
                price['promo'] = 0
            common.add_product(kafka_producer, config['hubo']['name'], product, price)
        except Exception as err:
            common.dump_failed_product(config['hubo']['name'], prod_url, soup, err, traceback.format_exc())


def next_url(url_list):
    # Pop the first entry from the dict of pending category URLs.
    if len(url_list) == 0:
        return None
    key = next(iter(url_list))
    url = url_list[key]
    del url_list[key]
    return url


maxthreads = 5
sema = threading.Semaphore(value=maxthreads)
threads = list()

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['hubo']['name'], 'url': config['hubo']['url'], 'image_url': config['hubo']['logo']})
    common.clear_failed_product(config['hubo']['name'])
    get_doc = requests.get(config['hubo']['products_url'])
    data_json = json.loads(unidecode.unidecode(get_doc.content.decode('utf-8', 'ignore')))
    offset = 1
    while len(data_json['docs']) > 0:
        print(f'{offset}/{data_json["doc_count"]}')
        for product in data_json['docs']:
            thread = threading.Thread(target=get_product_details, args=(f'{config["hubo"]["url"]}{product["url"]}', config, producer, product))
            threads.append(thread)
            thread.start()
        offset = offset + int(data_json['limit'])
        get_doc = requests.get(config['hubo']['products_url_offset'].replace('__OFFSET__', str(offset)))
        data_json = json.loads(unidecode.unidecode(get_doc.content.decode('utf-8', 'ignore')))
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['hubo'])