first commit
This commit is contained in:
Binary file not shown.
@@ -0,0 +1,54 @@
|
||||
config:
|
||||
loglevel: logging.DEBUG
|
||||
log_sql: False
|
||||
sqlitedb: cheap_shopper.db
|
||||
active_stores: action, fun, brico
|
||||
kafka_boostrap_servers: apollo.home.lan
|
||||
action:
|
||||
name: Action
|
||||
url: https://www.action.com
|
||||
logo: https://upload.wikimedia.org/wikipedia/commons/4/45/Action_Nederland_Logo_2020.svg
|
||||
categories_url: https://www.action.com/api/navigation/categories?language=nl-BE
|
||||
products_url: https://www.action.com/api/subcategories/products/?routeSegment=__url__&offset=__offset__&limit=__limit__
|
||||
products_cookie: {'epslanguage': 'nl-BE'}
|
||||
brico:
|
||||
name: Brico
|
||||
url: https://www.brico.be
|
||||
logo: https://vdm.bricocdn.be/logos/brico.svg?ie
|
||||
cookie: {'language': 'nl'}
|
||||
brouwland:
|
||||
name: Brouwland
|
||||
url: https://www.brouwland.com
|
||||
logo: https://brouwlandprod-yappa.netdna-ssl.com/build/images/logo-brouwland2.2fc01423.png
|
||||
catalogue_url: https://www.brouwland.com/nl/onze-producten
|
||||
dreamland:
|
||||
name: Dreamland
|
||||
url: https://www.dreamland.be/e/nl/dl
|
||||
logo: https://seeklogo.com/images/D/dreamland-be-logo-7F2C0508F2-seeklogo.com.png
|
||||
delhaize:
|
||||
name: Delhaize
|
||||
url: https://delhaize.be/shop
|
||||
logo:
|
||||
fun:
|
||||
name: Fun
|
||||
url: https://www.fun.be
|
||||
logo: https://e7.pngegg.com/pngimages/935/634/png-clipart-fun-logo-fun-toyshop-logo-icons-logos-emojis-toy-shop-logos.png
|
||||
categories_url: https://www.fun.be/seall/menu/index
|
||||
gamma:
|
||||
name: Gamma
|
||||
url: https://www.gamma.be
|
||||
logo: https://nl.wikipedia.org/wiki/Gamma_(winkel)#/media/Bestand:Gamma_logo_2010.png
|
||||
categories_url: https://www.gamma.be/nl/resources/menu/categories
|
||||
hubo:
|
||||
name: Hubo
|
||||
url: https://www.hubo.be
|
||||
logo: https://www.hubo.be/content/dam/hubo/site-afbeeldingen/hubo-algemeen/logo.svg
|
||||
categories_url: https://www.hubo.be/nl/a.html
|
||||
products_url: https://sd.searchnode.net/v1/query/docs?query_key=CzVuTtVooUCa8uwwNA6tA9X6lMXVLq01&sort=-score&filters.categoriesMeta=assortiment&limit=60
|
||||
products_url_offset: https://sd.searchnode.net/v1/query/docs?query_key=CzVuTtVooUCa8uwwNA6tA9X6lMXVLq01&sort=-score&filters.categoriesMeta=assortiment&offset=__OFFSET__
|
||||
intratuin_nl:
|
||||
name: Intratuin NL
|
||||
url: https://www.intratuin.nl
|
||||
intratuin_be:
|
||||
name: Intratuin BE
|
||||
url: https://www.intratuin.be
|
||||
@@ -0,0 +1,152 @@
|
||||
import yaml
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import traceback
|
||||
import requests
|
||||
import re
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from ratelimit import limits, sleep_and_retry
|
||||
from time import sleep
|
||||
from random import randint
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def get_config():
    """Load the scraper configuration from ``cheap_shopper.yaml``.

    The file is opened relative to the current working directory and parsed
    with PyYAML's ``FullLoader``.

    Returns:
        The parsed YAML document.
    """
    with open('cheap_shopper.yaml', 'r') as config_file:
        return yaml.load(config_file, Loader=yaml.FullLoader)
|
||||
|
||||
|
||||
def add_product(kafka_producer, store, product, price):
    """Publish one scraped product together with its price.

    Args:
        kafka_producer: Producer handed on to ``send_kafka_message``.
        store: Store name the product belongs to.
        product: Dict of product attributes.
        price: Dict of price attributes.
    """
    message = {
        'type': 'product',
        'store': store,
        'product': product,
        'price': price,
    }
    # The message travels as UTF-8 encoded JSON.
    payload = bytearray(json.dumps(message), 'utf-8')
    send_kafka_message(kafka_producer, payload)
|
||||
|
||||
|
||||
def insert_update_store(kafka_producer, store):
    """Publish a ``store`` message describing one shop.

    Args:
        kafka_producer: Producer handed on to ``send_kafka_message``.
        store: Dict of store attributes; it is embedded unchanged under the
            ``store`` key of the message.
    """
    message = {'type': 'store', 'store': store}
    # The message travels as UTF-8 encoded JSON.
    payload = bytearray(json.dumps(message), 'utf-8')
    send_kafka_message(kafka_producer, payload)
|
||||
|
||||
|
||||
def send_kafka_message(kafka_producer, message, topic='shopper_db'):
    """Send one message through the given producer.

    Args:
        kafka_producer: Object exposing ``send(topic, message)``.
        message: Payload passed to ``send`` unchanged.
        topic: Destination topic. Defaults to ``'shopper_db'``, the topic
            this function always used before the parameter existed.
    """
    kafka_producer.send(topic, message)
|
||||
|
||||
|
||||
def update_store_prices(kafka_producer, config):
    """Refresh the store record, then announce that its price run is done.

    Two messages are produced: a ``store`` message carrying name, url, logo
    and today's date (``dd/mm/YYYY``), followed by a ``store_update``
    message naming the store.

    Args:
        kafka_producer: Producer used for both messages.
        config: Per-store configuration dict; ``name``, ``url`` and ``logo``
            are read.
    """
    store_record = {
        'store': config['name'],
        'url': config['url'],
        'image_url': config['logo'],
        'last_update': datetime.now().strftime('%d/%m/%Y'),
    }
    insert_update_store(kafka_producer, store_record)

    message_json = json.dumps({'type': 'store_update', 'store': config['name']})
    send_kafka_message(kafka_producer, bytearray(message_json, 'utf-8'))
    print(f'updating prices {message_json}')
|
||||
|
||||
|
||||
def clear_failed_product(store):
    """Ensure ``failed/<store>`` exists and is empty.

    The directory (and its ``failed`` parent) is created when missing.
    Everything already inside it is removed: sub-directories recursively,
    plain files and symlinks individually.

    Args:
        store: Store name used as the directory name below ``failed``.
    """
    store_dir = f'failed/{store}'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(store_dir, exist_ok=True)

    for entry in os.listdir(store_dir):
        entry_path = os.path.join(store_dir, entry)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            # rmtree() refuses plain files and symlinks; remove them directly.
            os.remove(entry_path)
|
||||
|
||||
|
||||
def dump_failed_product(store, prod_url, page, err, trace):
    """Write diagnostics for a product that could not be processed.

    Creates ``failed/<store>/<dirname>/`` where ``dirname`` is ``prod_url``
    with ``https://`` removed, ``.`` replaced by ``-`` and ``/`` by ``_``.
    Two files are written there: ``error.txt`` (url, error, traceback) and
    ``page.html`` (``str(page)``). A second failure for the same URL
    overwrites the earlier dump instead of raising.

    Args:
        store: Store name used as directory name below ``failed``.
        prod_url: URL (or path) identifying the failed item.
        page: Page content; written via ``str(page)``.
        err: Error object or message; written via ``str(err)``.
        trace: Traceback text; written via ``str(trace)``.
    """
    dirname = prod_url.replace('https://', '').replace('.', '-').replace('/', '_')
    target_dir = f'failed/{store}/{dirname}'
    # makedirs(exist_ok=True) tolerates repeated failures for the same URL
    # and concurrent callers creating the parent directories.
    os.makedirs(target_dir, exist_ok=True)

    separator = '===========================================\n'
    # Explicit UTF-8 so non-ASCII error text cannot fail on platforms whose
    # default encoding is narrower.
    with open(f'{target_dir}/error.txt', 'w', encoding='utf-8') as err_file:
        err_file.write(f'{prod_url}\n')
        err_file.write(separator)
        err_file.write(f'{str(err)}\n')
        err_file.write(separator)
        err_file.write(str(trace))

    with open(f'{target_dir}/page.html', 'w', encoding='utf-8') as page_file:
        page_file.write(str(page))
|
||||
|
||||
def get_proxies():
    """Fetch the raw proxy list published on free-proxy-list.net.

    The page's ``<div id="raw">`` textarea is scanned for ``ip:port``
    entries.

    Returns:
        A list of ``"a.b.c.d:port"`` strings; empty when the expected markup
        is not present or nothing matches.
    """
    page = requests.get("https://free-proxy-list.net/")
    soup = BeautifulSoup(page.content, "html.parser")

    raw_div = soup.find("div", {"id": "raw"})
    raw_textarea = raw_div.find("textarea") if raw_div is not None else None
    if raw_textarea is None:
        return []

    # The previous pattern demanded a literal "." right before the ":" and
    # therefore never matched an "ip:port" pair.
    return re.findall(r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+", raw_textarea.text)
|
||||
|
||||
|
||||
@sleep_and_retry
@limits(calls=2, period=1)
def get_soup_page(url, cookie=None):
    """Fetch ``url`` and parse it with BeautifulSoup (rate limited).

    The ``ratelimit`` decorators cap the call rate at two calls per second;
    on top of that every call sleeps one to two seconds before requesting.

    Args:
        url: Address to fetch.
        cookie: Optional cookies passed to ``requests.get``.

    Returns:
        The parsed page, or ``None`` when the status code is neither 200 nor
        301 or when the request raised.
    """
    sleep(randint(1, 2))
    soup = None
    try:
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
            'content-type': 'text/html;charset=UTF-8',
        }
        page = requests.get(url, cookies=cookie, headers=headers)
        if page.status_code not in (200, 301):
            return None
        soup = BeautifulSoup(page.content, "html.parser")
    except ConnectionRefusedError:
        print(traceback.format_exc())
        sleep(randint(2, 3))
        # Retry with the same cookies; the retry used to drop them.
        soup = get_soup_page(url, cookie)
    except Exception as err:
        print(traceback.format_exc())
        print(err)

    return soup
|
||||
|
||||
def get_soup_page_no_limit(url, cookie=None, payload=None, headers=None):
    """Fetch ``url`` and parse it with BeautifulSoup (no rate-limit decorator).

    Every call still sleeps one to two seconds before requesting.

    Args:
        url: Address to fetch.
        cookie: Optional cookies passed to ``requests.get``.
        payload: Optional value passed as ``data=`` to ``requests.get``.
        headers: Optional request headers. ``None`` or an empty mapping
            selects the default ``content-type``. The mapping is copied and
            never modified.

    Returns:
        The parsed page, or ``None`` when the status code is neither 200 nor
        301 or when the request raised.
    """
    sleep(randint(1, 2))
    soup = None
    try:
        if headers:
            # Copy so the caller's dict (or a shared default) is not mutated.
            request_headers = dict(headers)
        else:
            request_headers = {'content-type': 'text/html;charset=UTF-8'}

        # NOTE(review): the user-agent is overridden unconditionally, so a
        # caller-supplied 'user-agent' has no effect - confirm this is intended.
        request_headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51'

        page = requests.get(url, cookies=cookie, data=payload, headers=request_headers)
        if page.status_code not in (200, 301):
            return None
        soup = BeautifulSoup(page.content, "html.parser")
    except Exception as err:
        print(traceback.format_exc())
        print(err)
    return soup
|
||||
+187
@@ -0,0 +1,187 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import sqlite3
|
||||
import common
|
||||
import json
|
||||
import time
|
||||
import threading
|
||||
|
||||
from datetime import datetime
|
||||
from kafka import KafkaConsumer
|
||||
|
||||
|
||||
def committer(connection):
    """Commit ``connection`` every ten seconds, forever.

    Intended as the target of a background thread. A failing commit is
    printed and retried on the next cycle instead of ending the loop, which
    would silently stop all further commits.

    Args:
        connection: Open ``sqlite3`` connection.
    """
    while True:
        try:
            connection.commit()
        except sqlite3.Error as err:
            print(err)
        time.sleep(10)
|
||||
|
||||
|
||||
def insert_update_store(connection, stores, value_json):
    """Insert a row into ``store`` or update the existing one.

    The keys of ``value_json['store']`` are used as column names. An
    ``INSERT`` is tried first; on ``sqlite3.IntegrityError`` the row whose
    ``store`` column matches is updated instead. Database errors are printed,
    not raised.

    Args:
        connection: Open ``sqlite3`` connection.
        stores: Cache mapping store name to row id; updated in place.
        value_json: Decoded message; ``value_json['store']['store']`` is the
            store name.

    Raises:
        KeyError: If ``value_json`` lacks ``'store'`` or the store dict lacks
            its ``'store'`` name.
    """
    store = value_json['store']
    store_name = store['store']
    columns = list(store)

    # Column names cannot be bound as SQL parameters, so only plain
    # identifiers are allowed into the statement text.
    invalid = [column for column in columns if not str(column).isidentifier()]
    if invalid:
        print(f'insert_update_store: invalid column name(s) {invalid}')
        return

    cursor = connection.cursor()
    try:
        column_sql = ', '.join(columns)
        placeholders = ', '.join('?' for _ in columns)
        cursor.execute(
            f'INSERT INTO store ({column_sql}) VALUES ({placeholders})',
            [store[column] for column in columns],
        )
        stores[store_name] = cursor.lastrowid
        return
    except sqlite3.IntegrityError:
        pass  # The row already exists: fall through to the update.
    except Exception as err:
        print(err)
        return

    try:
        update_columns = [column for column in columns if column != 'store']
        if update_columns:  # "UPDATE store SET  WHERE ..." would be invalid SQL
            assignments = ', '.join(f'{column}=?' for column in update_columns)
            cursor.execute(
                f'UPDATE store SET {assignments} WHERE store=?',
                [store[column] for column in update_columns] + [store_name],
            )
        if store_name not in stores:
            # Keep the id cache usable for rows that existed before it was built.
            cursor.execute('SELECT id FROM store WHERE store=?', (store_name,))
            row = cursor.fetchone()
            if row is not None:
                stores[store_name] = row[0]
    except Exception as err:
        print(err)
|
||||
|
||||
|
||||
def insert_update_product(connection, stores, value_json):
    """Insert or update a row in ``products``, then record its price.

    The keys of ``value_json['product']`` are used as column names and the
    store id from ``stores`` is written to the ``store`` column. An
    ``INSERT`` is tried first; on ``sqlite3.IntegrityError`` the row matching
    ``(store, sku_code)`` is updated and its id looked up. When a product id
    is known, ``insert_update_price`` is called with it.

    Messages naming an unknown store or lacking ``sku_code`` are reported and
    skipped. No price row is written when the product row could not be
    stored. Database errors are printed, not raised.

    Args:
        connection: Open ``sqlite3`` connection.
        stores: Cache mapping store name to row id.
        value_json: Decoded message with ``store``, ``product`` and ``price``.
    """
    product = value_json['product']
    try:
        store_id = stores[value_json['store']]
        sku_code = product['sku_code']
    except KeyError as err:
        # Previously an uncaught KeyError that stopped the whole consumer.
        print(f'insert_update_product: missing {err}')
        return

    columns = list(product)
    # Column names cannot be bound as SQL parameters, so only plain
    # identifiers are allowed into the statement text.
    invalid = [column for column in columns if not str(column).isidentifier()]
    if invalid:
        print(f'insert_update_product: invalid column name(s) {invalid}')
        return

    cursor = connection.cursor()
    product_id = None
    try:
        column_sql = ', '.join(['store'] + columns)
        placeholders = ', '.join('?' for _ in range(len(columns) + 1))
        cursor.execute(
            f'INSERT INTO products ({column_sql}) VALUES ({placeholders})',
            [store_id] + [product[column] for column in columns],
        )
        product_id = cursor.lastrowid
    except sqlite3.IntegrityError:
        try:
            update_columns = [column for column in columns if column != 'sku_code']
            if update_columns:  # nothing to SET when only sku_code was sent
                assignments = ', '.join(f'{column}=?' for column in update_columns)
                cursor.execute(
                    f'UPDATE products SET {assignments} WHERE store=? and sku_code=?',
                    [product[column] for column in update_columns] + [store_id, sku_code],
                )
            cursor.execute(
                'SELECT id FROM products WHERE store=? and sku_code=?',
                (store_id, sku_code),
            )
            row = cursor.fetchone()
            if row is not None:
                product_id = row[0]
        except Exception as err:
            print(err)
    except Exception as err:
        print(err)

    if product_id is None:
        # Without a product row a price entry would be orphaned.
        return

    insert_update_price(connection, stores, value_json, product_id)
|
||||
|
||||
def insert_update_price(connection, stores, value_json, product_id):
    """Record the current price of a product in the ``price`` table.

    If an active row with the same ``product_id`` and ``price`` exists, only
    its ``last_update`` is set to today (``dd/mm/YYYY``). Otherwise a new row
    is inserted whose columns are ``product_id``, ``last_update`` and every
    key of ``value_json['price']``. Database errors are printed, not raised.

    Args:
        connection: Open ``sqlite3`` connection.
        stores: Unused here; kept for a uniform handler signature.
        value_json: Decoded message; ``value_json['price']`` is read.
        product_id: Id of the product row the price belongs to.
    """
    cursor = connection.cursor()
    price = value_json['price']

    column_names = ['product_id', 'last_update', *price]
    parameters = [product_id, datetime.now().strftime('%d/%m/%Y'), *price.values()]
    column_sql = '(' + ', '.join(column_names) + ')'
    placeholder_sql = '(' + ', '.join(['?'] * len(column_names)) + ')'

    already_known = False
    try:
        cursor.execute(
            'SELECT id FROM price WHERE product_id=? and price=? and active=1',
            (product_id, price["price"]),
        )
        already_known = cursor.fetchone() is not None
    except Exception as err:
        print(err)

    try:
        if already_known:
            cursor.execute(
                'UPDATE price SET last_update=? WHERE product_id=? and price=? and active=1',
                (datetime.now().strftime("%d/%m/%Y"), product_id, price["price"]),
            )
        else:
            cursor.execute(f'INSERT INTO price {column_sql} VALUES {placeholder_sql}', parameters)
    except Exception as err:
        # Covers sqlite3.IntegrityError as well; every failure is just printed.
        print(err)
|
||||
|
||||
|
||||
def deactivate_old_price(connection, stores, value_json):
    """Mark prices of a store that were last updated before today inactive.

    Sets ``active=0`` on every ``price`` row that belongs to a product of the
    store named in ``value_json['store']`` and whose ``last_update`` lies
    before the current date. Errors (including an unknown store name) are
    printed, not raised.

    Args:
        connection: Open ``sqlite3`` connection.
        stores: Cache mapping store name to row id.
        value_json: Decoded ``store_update`` message.
    """
    cursor = connection.cursor()
    try:
        # last_update is stored as dd/mm/YYYY text; comparing those strings
        # directly orders by day-of-month first. Rearranging to YYYYMMDD
        # inside SQL makes the text comparison chronological.
        sql_statement = (
            'UPDATE price SET active=0 '
            'WHERE product_id IN (SELECT id FROM products WHERE store=?) '
            'AND substr(last_update, 7, 4) || substr(last_update, 4, 2) '
            '|| substr(last_update, 1, 2) < ?'
        )
        cursor.execute(
            sql_statement,
            (stores[value_json["store"]], datetime.now().strftime("%Y%m%d")),
        )
    except Exception as err:
        print(err)
|
||||
|
||||
def insert(connection, stores, value_json):
    """Route a decoded message to the handler for its ``type``.

    ``store`` goes to ``insert_update_store``, ``store_update`` to
    ``deactivate_old_price`` and ``product`` to ``insert_update_product``.
    Any other type is ignored.

    Args:
        connection: Open ``sqlite3`` connection.
        stores: Cache mapping store name to row id.
        value_json: Decoded message; must contain ``type``.
    """
    kind = value_json['type']

    if kind == 'store':
        print('insert_update_store')
        insert_update_store(connection, stores, value_json)
    elif kind == 'store_update':
        print('deactivate_old_price')
        deactivate_old_price(connection, stores, value_json)
    elif kind == 'product':
        insert_update_product(connection, stores, value_json)
|
||||
|
||||
|
||||
def get_stores(connection):
    """Read all stores into a name-to-id mapping.

    Args:
        connection: Open ``sqlite3`` connection with a ``store`` table.

    Returns:
        Dict mapping each ``store`` value to its ``id``.
    """
    # Use the connection that was passed in; the function used to reach for
    # the module-level ``con`` that only exists when run as a script.
    cursor = connection.cursor()
    cursor.execute('SELECT store, id FROM store')
    return {name: store_id for name, store_id in cursor.fetchall()}
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Consumer entry point: applies every message of the 'shopper_db' topic
    # to the SQLite database named in the configuration.
    config = common.get_config()

    # check_same_thread=False because the committer thread shares the connection.
    con = sqlite3.connect(config['config']['sqlitedb'], check_same_thread=False)
    if config['config']['log_sql']:
        con.set_trace_callback(print)

    stores = get_stores(con)

    commit_thread = threading.Thread(target=committer, args=(con,), daemon=True)
    commit_thread.start()

    consumer = KafkaConsumer('shopper_db', bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    try:
        for msg in consumer:
            try:
                value_json = json.loads(msg.value.decode("utf-8"))
            except ValueError as err:
                # Covers invalid UTF-8 and invalid JSON; one bad message must
                # not stop the consumer.
                print(f'skipping malformed message: {err}')
                continue
            insert(con, stores, value_json)
    finally:
        # The committer only runs every ten seconds; persist the tail of the
        # work when the loop ends for any reason.
        con.commit()
|
||||
@@ -0,0 +1,89 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import requests
|
||||
import logging
|
||||
import xml.etree.ElementTree as ET
|
||||
import json
|
||||
import unidecode
|
||||
import sqlite3
|
||||
import common
|
||||
import traceback
|
||||
import sys
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime
|
||||
from kafka import KafkaProducer
|
||||
|
||||
|
||||
def get_categories(url):
    """Download the category navigation and map labels to URLs.

    The response body is parsed as XML; every
    ``./Data/CategoryNavigationViewModel`` element contributes its ``Label``
    text as key and its ``Url`` text as value.

    Args:
        url: Address of the category navigation resource.

    Returns:
        Dict mapping category label to category URL.
    """
    response = requests.get(url)
    tree_root = ET.fromstring(response.content)
    return {
        node.find("Label").text: node.find("Url").text
        for node in tree_root.findall("./Data/CategoryNavigationViewModel")
    }
|
||||
|
||||
|
||||
def get_products(config, cat_url, kafka_producer):
    """Fetch all products of one Action sub-category and publish them.

    A first request with ``limit=1`` reads ``totalCount``; a second request
    uses that count as limit to get every item at once. Items whose ``type``
    is not ``product`` are skipped. An item that raises while being mapped is
    handed to ``common.dump_failed_product``.

    Args:
        config: Full configuration dict; the ``action`` section is read.
        cat_url: Route segment of the sub-category.
        kafka_producer: Producer passed to ``common.add_product``.
    """
    def load_json(response):
        # Transliterate to ASCII before parsing, as the responses are decoded
        # leniently ('ignore').
        return json.loads(unidecode.unidecode(response.content.decode('utf-8', 'ignore')))

    listing_url = (
        config['action']['products_url']
        .replace("__url__", cat_url)
        .replace("__offset__", "0")
    )
    products_cookie = config['action']['products_cookie']

    count_response = requests.get(listing_url.replace("__limit__", "1"), cookies=products_cookie)
    total_count = load_json(count_response)['totalCount']

    listing_response = requests.get(
        listing_url.replace("__limit__", str(total_count)),
        cookies=products_cookie,
    )
    listing = load_json(listing_response)

    for product_json in listing['items']:
        try:
            if product_json['type'] != 'product':
                continue

            product = {
                'title': product_json['title'],
                'url': config['action']['url'] + product_json['url'],
                'image_url': config['action']['url'] + product_json['imageUrl'],
                'info': product_json['subTitle'],
                'sku_code': product_json['code'],
            }
            for spec in product_json['specifications']:
                spec_id = spec['id']
                if spec_id == 'attEANCodeVariant':
                    product['ean_code'] = spec['value']
                if spec_id == 'attLongDescription':
                    product['description'] = spec['value']
            product['brand'] = product_json['brandName']

            price = {
                'price': product_json['price'],
                # Deliberately "== False": any other value, None included,
                # counts as a deal.
                'promo': 0 if product_json['isDeal'] == False else 1,
                'promo_start': product_json['dealStartDate'],
                'promo_end': product_json['dealEndDate'],
            }

            common.add_product(kafka_producer, config['action']['name'], product, price)
        except Exception as err:
            common.dump_failed_product(config['action']['name'], cat_url, product_json, err, traceback.format_exc())
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Action scraper entry point: walk every category page, publish the
    # products of each linked sub-category, then announce the finished run.
    config = common.get_config()

    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['action']['name'], 'url': config['action']['url'], 'image_url': config['action']['logo']})
    common.clear_failed_product(config['action']['name'])

    categories = get_categories(config['action']['categories_url'])

    for cat in categories:
        category_url = f"{config['action']['url']}{categories[cat]}"
        soup = common.get_soup_page_no_limit(category_url)
        if soup is None:
            # The helper returns None for failed requests; skip the category
            # instead of failing with AttributeError.
            print(f'skipping category, page not available: {category_url}')
            continue
        for sc in soup.find_all("a", class_="subcategory-cta-list__cta"):
            get_products(config, sc['href'], producer)

    common.update_store_prices(producer, config['action'])
    # Sends are asynchronous; wait for delivery before the process exits.
    producer.flush()
|
||||
+158
@@ -0,0 +1,158 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import requests
|
||||
import logging
|
||||
import xml.etree.ElementTree as ET
|
||||
import json
|
||||
import unidecode
|
||||
import sqlite3
|
||||
import common
|
||||
import threading
|
||||
import re
|
||||
import traceback
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime
|
||||
from kafka import KafkaProducer
|
||||
|
||||
|
||||
def get_categories(url, cookie):
    """Collect category links from the Brico mega dropdown menu.

    Links containing ``bricosolar`` are skipped. Hrefs that do not contain
    ``https://www.brico.be`` get that prefix. A URL already collected is not
    added again under another label.

    Args:
        url: Page that contains the ``mxd-mega-dropdown`` element.
        cookie: Cookies passed to the page fetch.

    Returns:
        Dict mapping anchor text to absolute category URL.
    """
    soup = common.get_soup_page_no_limit(url, cookie)

    categories = {}
    dropdown = soup.find('div', class_="mxd-mega-dropdown")

    for anchor in dropdown.find_all('a'):
        href = anchor.get('href')
        if not href:
            continue  # anchors without a target cannot be crawled
        if 'bricosolar' in href:
            print(f'remove {href}')
            continue
        if 'https://www.brico.be' not in href:
            href = f'https://www.brico.be{href}'
        # Compare the normalised URL: the stored values are absolute, so a
        # relative href could never match them before normalisation.
        if href not in categories.values():
            categories[anchor.text] = href

    return categories
|
||||
|
||||
|
||||
def get_product_details(prod_url, config, kafka_producer, failed):
    """Scrape one Brico product page and publish product and price.

    Nothing is published when the page has no product-detail container or no
    ``<ins>`` price element. EAN code and brand are optional: they are added
    when found and simply left out otherwise. Any exception is handed to
    ``common.dump_failed_product`` together with the page.

    Args:
        prod_url: Absolute product URL; its last path segment becomes the
            ``sku_code``.
        config: Full configuration dict; the ``brico`` section is read.
        kafka_producer: Producer passed to ``common.add_product``.
        failed: Unused; kept for interface compatibility.
    """
    soup = None  # defined up front so the error handler can always use it
    try:
        soup = common.get_soup_page_no_limit(prod_url, config['brico']['cookie'])

        prod_detail = soup.find('div', class_='mxd-container mxd-product-detail-layout')
        if prod_detail is None:
            return
        prod_specs = soup.find('div', {'id': 'specs'})
        title = prod_detail.find('h1', class_='mxd-product-detail-layout-heading mxd-h1')
        spec_paragraphs = prod_specs.find_all("p")

        product = {}
        price = {}
        product['title'] = title.find("span", {'itemprop': 'name'}).text
        product['url'] = prod_url
        product['image_url'] = prod_detail.find("img", class_="mxd-fixed-ratio-image")['src']
        product['info'] = spec_paragraphs[0].text
        product['sku_code'] = prod_url.split("/")[-1]

        if len(spec_paragraphs) > 1:
            ean_match = re.search(r"([0-9]{8,13})", spec_paragraphs[1].text)
            # A second paragraph without an EAN must not fail the product.
            if ean_match is not None:
                product['ean_code'] = ean_match.group(1)

        pattern_brand = r'brands":\[\{"code":"[\w\s]+","name":"([\w\s]+)'
        pattern_ean = r'"ean":"([0-9]{8,13})"'
        for script in soup.find_all("script"):
            script_text = script.string
            if script_text is None:
                continue
            if '"brands"' in script_text:
                brand_match = re.search(pattern_brand, script_text)
                if brand_match is not None:
                    product['brand'] = brand_match.group(1)
            if '"ean"' in script_text and 'ean_code' not in product:
                ean_match = re.search(pattern_ean, script_text)
                if ean_match is not None:
                    product['ean_code'] = ean_match.group(1)

        price_tag = soup.find("ins")
        if price_tag is None:
            return
        price['price'] = price_tag.find("meta")['content']
        # A <del> element on the page is taken as the sign of a promotion.
        price['promo'] = 0 if soup.find("del") is None else 1

        common.add_product(kafka_producer, config['brico']['name'], product, price)
    except Exception as err:
        common.dump_failed_product(config['brico']['name'], prod_url, soup, err, traceback.format_exc())
|
||||
|
||||
|
||||
def next_url(url_list):
    """Remove and return the value of the first entry of ``url_list``.

    Args:
        url_list: Dict of pending URLs; modified in place.

    Returns:
        The value stored under the first key, or ``None`` when the dict is
        empty.
    """
    if not url_list:
        return None

    first_key = next(iter(url_list))
    return url_list.pop(first_key)
|
||||
|
||||
|
||||
# Number of slots in the semaphore below.
maxthreads = 10
# Semaphore sized to maxthreads.
sema = threading.Semaphore(maxthreads)
# Worker threads started by the script; joined before the run is finalised.
threads = []
|
||||
|
||||
if __name__ == "__main__":
    # Brico scraper entry point: crawl category pages breadth-first, start
    # one worker thread per newly discovered product, then announce the
    # finished run.
    config = common.get_config()

    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['brico']['name'], 'url': config['brico']['url'], 'image_url': config['brico']['logo']})
    common.clear_failed_product(config['brico']['name'])

    def _fetch_details_limited(*args):
        # Hold one semaphore slot per product fetch so that at most
        # `maxthreads` fetches run at the same time.
        with sema:
            get_product_details(*args)

    all_categories = get_categories(config['brico']['url'], config['brico']['cookie'])

    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    failed_prod = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')

        soup = common.get_soup_page_no_limit(url, config['brico']['cookie'])
        if soup is None:
            # The helper returns None for failed requests; move on to the
            # next category instead of failing with AttributeError.
            url = next_url(process_categories)
            continue

        page_products = soup.find_all("div", class_="mxd-block-card mxd-block-card-responsive")
        extra_cat = soup.find_all("a", class_="mxd-product-list-link")

        for cat in extra_cat:
            if cat['href'].find('bricosolar') >= 0:
                print(f'main remove {cat["href"]}')
            elif cat['href'] not in all_categories.values():
                cat_name = cat.find("span").text
                # Give relative links a leading slash before joining with the base URL.
                if cat['href'].find("/") > 0:
                    cat['href'] = f'/{cat["href"]}'
                process_categories[cat_name] = f'{config["brico"]["url"]}{cat["href"]}'
                all_categories[cat_name] = cat['href']

        for product in page_products:
            product = product.find("a", class_="mxd-block-card-link")
            if product['href'].find("/nl") == -1:
                product['href'] = f'/nl{product["href"]}'
            if product['href'].find("/") > 0:
                product['href'] = f'/{product["href"]}'
            if product['href'] not in products.values():
                prod_name = product.find("span", {"itemprop": "name"}).text
                products[prod_name] = product["href"]

                thread = threading.Thread(target=_fetch_details_limited, args=(f'{config["brico"]["url"]}{product["href"]}', config, producer, failed_prod))
                threads.append(thread)
                thread.start()

        next_page = soup.find("a", {"rel": "next"})
        if next_page is None:
            url = next_url(process_categories)
        else:
            base = soup.find("link", {"rel": "alternate", "hreflang": "nl-be"})
            url = f'{base["href"]}{next_page["href"]}'
            print(f'next page: {url}')

    for t in threads:
        t.join()

    common.update_store_prices(producer, config['brico'])
    # Sends are asynchronous; wait for delivery before the process exits.
    producer.flush()
|
||||
@@ -0,0 +1,153 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import requests
|
||||
import logging
|
||||
import xml.etree.ElementTree as ET
|
||||
import json
|
||||
import unidecode
|
||||
import sqlite3
|
||||
import common
|
||||
import threading
|
||||
import re
|
||||
import traceback
|
||||
import sys
|
||||
import itertools
|
||||
import logging
|
||||
#logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
from time import sleep
|
||||
from random import randint
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime
|
||||
from kafka import KafkaProducer
|
||||
from ratelimit import limits, sleep_and_retry
|
||||
|
||||
|
||||
|
||||
def get_categories(url):
    """Collect the category tiles shown on a Brouwland catalogue page.

    Every ``<article class="item bg-gray">`` contributes its first anchor:
    the part of the anchor ``title`` before the first ``' | '`` is the key,
    the ``href`` the value.

    Args:
        url: Catalogue page to read.

    Returns:
        Dict mapping category title to href.
    """
    soup = common.get_soup_page(url)

    categories = {}
    for tile in soup.find_all('article', class_="item bg-gray"):
        link = tile.find('a')
        name = link['title'].partition(' | ')[0]
        categories[name] = link['href']

    return categories
|
||||
|
||||
|
||||
def get_product_details(prod_url, config, kafka_producer, anchor_title):
    """Scrape one Brouwland product page and publish product and price.

    Pages that cannot be fetched, have no ``<article>`` or show no SKU are
    written to the failed-product dump and not published. Title,
    description, EAN code and brand are optional. Any exception is printed
    and dumped as well.

    Args:
        prod_url: Absolute product URL.
        config: Full configuration dict; the ``brouwland`` section is read.
        kafka_producer: Producer passed to ``common.add_product``.
        anchor_title: Unused; kept for interface compatibility.
    """
    soup = None  # defined up front so the error handler can always use it
    try:
        soup = common.get_soup_page(prod_url)
        if soup is None:
            common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None)
            return

        prod_detail = soup.find('article')
        if prod_detail is None:
            common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None)
            return

        title = prod_detail.find('h1', {'itemprop': 'name'})
        image = prod_detail.find('a', class_='image-click').find('img')
        description = prod_detail.find('div', {'itemprop': 'description'})
        ean_code = prod_detail.find('td', {'itemprop': 'eancode'})
        sku = prod_detail.find('span', {'itemprop': 'sku'})
        brand = prod_detail.find('span', {'itemprop': 'brand'})
        price_detail = prod_detail.find('span', {'itemprop': 'price'})

        if sku is None:
            # The database consumer reads product['sku_code'] unconditionally,
            # so a product without SKU must not be published.
            common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No SKU found", None)
            return

        product = {}
        price = {}
        if title is not None:
            product['title'] = title.text.strip()
        product['url'] = prod_url
        product['image_url'] = image['src']
        if description is not None:
            product['info'] = description.text.strip()
        product['sku_code'] = sku.text.strip()
        if ean_code is not None:
            product['ean_code'] = ean_code.text.strip()
        if brand is not None:
            product['brand'] = brand.text.strip()

        # The second whitespace-separated token of the price text is used.
        price['price'] = price_detail.text.split()[1]
        # A <del> element on the page is taken as the sign of a promotion.
        price['promo'] = 0 if soup.find("del") is None else 1

        common.add_product(kafka_producer, config['brouwland']['name'], product, price)
    except Exception as err:
        print(traceback.format_exc())
        common.dump_failed_product(config['brouwland']['name'], prod_url, soup, err, traceback.format_exc())
|
||||
|
||||
|
||||
def next_url(url_list):
    """Pop the first pending URL from ``url_list``.

    Args:
        url_list: Dict of pending URLs; modified in place.

    Returns:
        The value of the first entry, or ``None`` when no entries are left.
    """
    for key in url_list:
        # Leave right after the removal, so the dict is never iterated
        # further once it has changed.
        return url_list.pop(key)
    return None
|
||||
|
||||
|
||||
# Number of slots in the semaphore below.
maxthreads = 5
# Semaphore sized to maxthreads.
sema = threading.Semaphore(maxthreads)
# Worker threads started by the script; joined before the run is finalised.
threads = []
|
||||
|
||||
if __name__ == "__main__":
    # Brouwland scraper entry point: crawl the catalogue breadth-first,
    # start one worker thread per newly discovered product, then announce
    # the finished run.
    config = common.get_config()

    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['brouwland']['name'], 'url': config['brouwland']['url'], 'image_url': config['brouwland']['logo']})
    common.clear_failed_product(config['brouwland']['name'])

    def _fetch_details_limited(*args):
        # Hold one semaphore slot per product fetch so that at most
        # `maxthreads` fetches run at the same time.
        with sema:
            get_product_details(*args)

    all_categories = get_categories(config['brouwland']['catalogue_url'])

    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        # Category and next-page hrefs are joined with the store base URL here.
        url = f'{config["brouwland"]["url"]}{url}'
        soup = common.get_soup_page(url)
        if soup is None:
            url = next_url(process_categories)
            continue
        page_products = soup.find_all("article", class_="product")
        extra_cat = soup.find_all("article", class_="item bg-gray")

        for cat in extra_cat:
            anchor = cat.find('a')
            if anchor['href'] not in all_categories.values():
                anchor_title = anchor['title'].split(' | ')[0]
                process_categories[anchor_title] = f'{anchor["href"]}'
                all_categories[anchor_title] = anchor['href']

        for product in page_products:
            anchor = product.find("a")
            if anchor['href'] not in products.values():
                anchor_title = anchor['title'].split(' | ')[0]
                # Items with "CADEAU" in the title are not scraped.
                if anchor_title.upper().find("CADEAU") > -1:
                    continue
                products[anchor_title] = anchor["href"]

                thread = threading.Thread(target=_fetch_details_limited, args=(f'{config["brouwland"]["url"]}{anchor["href"]}', config, producer, anchor_title))
                threads.append(thread)
                thread.start()

        next_page = soup.find("a", class_="next")
        if next_page is None:
            url = next_url(process_categories)
        else:
            url = next_page["href"]

    for t in threads:
        t.join()

    common.update_store_prices(producer, config['brouwland'])
    # Sends are asynchronous; wait for delivery before the process exits.
    producer.flush()
|
||||
@@ -0,0 +1,180 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import requests
|
||||
import logging
|
||||
import xml.etree.ElementTree as ET
|
||||
import json
|
||||
import unidecode
|
||||
import sqlite3
|
||||
import common
|
||||
import threading
|
||||
import re
|
||||
import traceback
|
||||
import sys
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime
|
||||
from kafka import KafkaProducer
|
||||
from ratelimit import limits, sleep_and_retry
|
||||
|
||||
|
||||
def get_categories(url):
    """Collect the main categories from the Dreamland sub-navigation.

    Every ``<p class="subNav__categoryTitle">`` contributes its first
    anchor: the anchor text is the key, its ``href`` the value.

    Args:
        url: Page that contains the sub-navigation.

    Returns:
        Dict mapping category name to href.
    """
    soup = dreamland_get_soup(url)

    links = (heading.find("a") for heading in soup.find_all("p", class_="subNav__categoryTitle"))
    return {link.text: link['href'] for link in links}
|
||||
|
||||
|
||||
@sleep_and_retry
@limits(calls=1, period=2)
def dreamland_get_soup(url, payload=None, cookie=None, headers=None):
    """Fetch a Dreamland page through ``common.get_soup_page_no_limit``.

    The ``ratelimit`` decorators cap the call rate at one call per two
    seconds.

    Args:
        url: Address to fetch.
        payload: Optional request payload, passed through.
        cookie: Optional cookies, passed through.
        headers: Optional request headers; ``None`` means no custom headers.

    Returns:
        Whatever ``common.get_soup_page_no_limit`` returns.
    """
    if headers is None:
        # A fresh dict per call instead of one default shared by all calls.
        headers = {}
    return common.get_soup_page_no_limit(url, payload=payload, cookie=cookie, headers=headers)
|
||||
|
||||
|
||||
def get_product_details(prod_url, config, kafka_producer):
    """Scrape one Dreamland product page and publish it via ``common.add_product``.

    Args:
        prod_url: URL of the product detail page.
        config: Parsed configuration; ``config['dreamland']['name']`` is used
            as the store name.
        kafka_producer: Producer object handed through to ``common.add_product``.

    Returns:
        None.  Any exception raised while fetching or parsing is caught and
        reported through ``common.dump_failed_product`` together with the page
        soup (``None`` when the fetch itself failed) and the traceback.
    """
    # Bound before the try block so the except handler can always reference
    # it, even when the fetch itself raises.
    soup = None
    try:
        soup = dreamland_get_soup(prod_url)

        product = {}
        price = {}

        # Keep the second colon-separated field of the SKU element's text.
        sku = soup.find("span", class_="sku").text.split(":")[1]
        title = soup.find("h1", class_="main_header").text
        image = soup.find("img", {'id': 'productMainImage'})['src']
        desc = soup.find("div", class_="product_text").text

        attrs = soup.find("div", {'id': 'Attributes_table'})
        items = attrs.find_all('li', attrs={'class': re.compile('.*attribute.*')})
        ean_list = []
        for item in items:
            if item.text.find('Merk') > -1:
                # Strip stray '"' and '>' characters around the value.
                product['brand'] = item.find("span").text.strip('">').strip()
            if item.text.find('EAN') > -1:
                # One attribute row may list several EAN codes, one per span.
                for code in item.find_all("span"):
                    ean_list.append(code.text.strip('">').strip())

        product['sku_code'] = sku
        product['url'] = prod_url
        product['title'] = title
        product['image_url'] = image
        product['info'] = desc
        product['ean_code'] = ", ".join(ean_list)

        # A "price red mini" element marks a promotional price; without it
        # the regular "product_price" element is used.
        promo_price = soup.find("div", class_="price red mini")
        if promo_price is None:
            price['price'] = soup.find("div", class_="product_price").text.strip("€\xa0")
            price['promo'] = 0
        else:
            price['price'] = promo_price.text.strip("€\xa0")
            price['promo'] = 1

        common.add_product(kafka_producer, config['dreamland']['name'], product, price)
    except Exception as err:
        common.dump_failed_product(config['dreamland']['name'], prod_url, soup, err, traceback.format_exc())
|
||||
|
||||
|
||||
def get_productListView(soup):
    """Extract the ``ProductListingView`` path embedded in a page's scripts.

    Every ``<script>`` element of *soup* is searched for a single-quoted
    string that starts with ``ProductListingView`` and is followed by ``,{``.

    Args:
        soup: Parsed page offering ``find_all("script")``.

    Returns:
        The quoted text (without the quotes) taken from the last script that
        contains such a string, or ``None`` when no script does.
    """
    pattern = re.compile(r"'(ProductListingView.*)',\{")
    listview = None
    for script in soup.find_all("script") or []:
        text = script.string
        if text is None:
            continue
        match = pattern.search(text)
        # A script may mention ProductListingView without the quoted form;
        # ignore it rather than failing on a missing match.
        if match is not None:
            listview = match.group(1)
    return listview
|
||||
|
||||
|
||||
def next_url(url_list):
    """Remove and return the first pending URL of *url_list*.

    Args:
        url_list: dict of pending URLs keyed by title; mutated in place.

    Returns:
        The value stored under the first key in iteration order, or ``None``
        when the dict is empty.
    """
    if not url_list:
        return None
    return url_list.pop(next(iter(url_list)))
|
||||
|
||||
|
||||
def get_dreamland_productListingView(url, index=0):
    """Request one slice of a Dreamland product listing.

    Sends a form-encoded payload to *url* through ``dreamland_get_soup``;
    *index* is inserted as both ``productBeginIndex`` and ``beginIndex``.

    Args:
        url: Address of the ProductListingView endpoint.
        index: Offset of the first product to request (default 0).

    Returns:
        Whatever ``dreamland_get_soup`` returns for the request.
    """
    request_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
        'content-type': 'application/x-www-form-urlencoded',
    }
    form_body = f'contentBeginIndex=0&productBeginIndex={index}&beginIndex={index}&orderBy=8&facetId=&pageView=grid&resultType=both&orderByContent=&searchTerm=&facet=&facetLimit=&minPrice=&maxPrice=&pageSize=&loadProductsList=true&storeId=13102&catalogId=15501&langId=-11&homePageURL=https%3A%2F%2Fwww.dreamland.be%2Fe%2Fnl%2Fdl%0A&commandContextCurrency=EUR&urlPrefixForHTTPS=https%3A%2F%2Fwww.dreamland.be%0A&urlPrefixForHTTP=https%3A%2F%2Fwww.dreamland.be%0A&wcc_integration_origin=&enableSKUListView=&widgetPrefix=12_4099276460824417158&pgl_widgetId=4099276460824417158&objectId=_12_-1011_4099276460824417158%0A&requesttype=ajax'
    return dreamland_get_soup(url, payload=form_body, headers=request_headers)
|
||||
|
||||
|
||||
# Intended cap on concurrent worker threads.
# NOTE(review): `sema` is created but nothing in this block acquires it, so
# the cap does not appear to be enforced - confirm.
maxthreads = 2
sema = threading.Semaphore(value=maxthreads)
# Every worker thread started by the crawl, joined before the final update.
threads = list()


if __name__ == "__main__":
    config = common.get_config()

    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['dreamland']['name'], 'url': config['dreamland']['url'], 'image_url': config['dreamland']['logo']})
    common.clear_failed_product(config['dreamland']['name'])

    all_categories = get_categories(config['dreamland']['url'])

    # `all_categories` remembers everything ever seen; `process_categories`
    # is the queue of category pages still to visit.
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        soup = common.get_soup_page(url)
        if soup is None:
            url = next_url(process_categories)
            continue

        product_listview = get_productListView(soup)

        # Facet links found on the page are queued as additional categories.
        for cat in soup.find_all("li", class_="singleFacet"):
            anchor = cat.find("a")
            if anchor['href'] not in all_categories.values():
                anchor_title = anchor.find("span", class_="facetName").text
                process_categories[anchor_title] = anchor["href"]
                all_categories[anchor_title] = anchor['href']

        index = 0
        while product_listview is not None:
            view_soup = get_dreamland_productListingView(f'{config["dreamland"]["url"]}/{product_listview}', index)
            if view_soup is None:
                # The listing request produced no page; give up on this
                # category instead of crashing the whole crawl.
                break

            view_products = view_soup.find_all("div", class_="product_info")
            if not view_products:
                # `index` only advances per product, so an empty page would
                # make the loop request the same offset forever.
                break

            for product in view_products:
                index = index + 1
                name = product.find("div", class_="product_name")
                anchor = product.find("a")
                if anchor['href'] not in products.values():
                    products[name] = anchor["href"]

                    thread = threading.Thread(target=get_product_details, args=(anchor["href"], config, producer))
                    threads.append(thread)
                    thread.start()

            # No "right_arrow" link means this was the last listing page.
            if view_soup.find("a", class_="right_arrow") is None:
                product_listview = None

        url = next_url(process_categories)

    for t in threads:
        t.join()

    common.update_store_prices(producer, config['dreamland'])
|
||||
+166
@@ -0,0 +1,166 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import requests
|
||||
import logging
|
||||
import xml.etree.ElementTree as ET
|
||||
import json
|
||||
import unidecode
|
||||
import sqlite3
|
||||
import common
|
||||
import threading
|
||||
import traceback
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime
|
||||
from kafka import KafkaProducer
|
||||
|
||||
|
||||
def get_categories(cat_url, base_url=None):
    """Download the Fun menu markup and return its category links.

    The response body is handled as one quoted string of escaped markup: the
    first and last byte are dropped, the escapes are undone by hand, and the
    result is parsed as XML.

    Args:
        cat_url: Address of the menu endpoint.
        base_url: Prefix removed from every link.  When omitted, the
            module-level ``config["fun"]["url"]`` is used, so the global
            ``config`` must exist in that case.

    Returns:
        dict mapping link text to the link's ``href`` with *base_url* removed.
        Links without ``href`` or text, ``#`` links, links whose text contains
        "CADEAU" or "VOLLEDIG" (case-insensitive), and links whose stripped
        target was already collected are left out.

    Raises:
        xml.etree.ElementTree.ParseError: if the cleaned body is not
            well-formed XML.
    """
    if base_url is None:
        base_url = config["fun"]["url"]

    page = requests.get(cat_url)
    content = page.content[1:-1].decode('utf-8')
    content = content.replace("\\r\\n", "")
    content = content.replace('\\/', '/')
    content = content.replace('\\"', '"')
    # A bare ampersand is not well-formed XML and makes the parser fail, so
    # it is written as an entity (surrounding spaces dropped, as before).
    content = content.replace(' & ', '&amp;')
    root = ET.fromstring(content)

    categories = {}
    for elm in root.iter("a"):
        href = elm.attrib.get('href')
        if href is None or href == '#' or elm.text is None:
            continue
        label = elm.text.upper()
        if "CADEAU" in label or "VOLLEDIG" in label:
            continue
        # Compare the stripped path: the stored values are stripped too, so
        # checking the raw href would never detect a duplicate.
        path = href.replace(base_url, "")
        if path not in categories.values():
            categories[elm.text] = path
    return categories
|
||||
|
||||
|
||||
def next_url(url_list):
    """Take the first queued URL out of *url_list* and return it.

    Args:
        url_list: dict of queued URLs; the returned entry is deleted from it.

    Returns:
        The URL stored under the first key in iteration order, or ``None``
        if the dict holds no entries.
    """
    for first_key in url_list:
        # Returning right away means the dict is never iterated after the pop.
        return url_list.pop(first_key)
    return None
|
||||
|
||||
|
||||
def get_product_details(prod_url, config, kafka_producer):
    """Scrape one Fun product page and publish it via ``common.add_product``.

    Args:
        prod_url: URL of the product detail page.
        config: Parsed configuration; ``config['fun']['name']`` is used as
            the store name.
        kafka_producer: Producer object handed through to ``common.add_product``.

    Returns:
        None.  Nothing happens when the page cannot be fetched; any parsing
        error is reported through ``common.dump_failed_product``.
    """
    soup = common.get_soup_page_no_limit(prod_url)
    if soup is None:
        return

    try:
        view = soup.find("div", class_="product-view")
        essential = view.find("div", class_="product-essential")

        image = view.find("img", {'itemprop': 'image'})['src']
        price = essential.find("meta", {'itemprop': 'price'})['content']

        # A "special-price" paragraph marks a promotion; its end date is
        # optional.
        promo = essential.find("p", class_="special-price") is not None
        promo_end = None
        if promo:
            valid_until = essential.find("meta", {'itemprop': "priceValidUntil"})
            if valid_until is not None:
                promo_end = valid_until['content']

        title = view.find("h1", {'itemprop': 'name'}).text
        sku = view.find("meta", {'itemprop': 'sku'})['content']
        brand = view.find("meta", {'itemprop': 'brand'})['content']
        description = view.find("meta", {'itemprop': 'description'})['content']

        # Join every specification except the SKU / EAN rows as
        # "label: value" pairs separated by " / ".
        specs = view.find("div", class_="tab-content tab-block-additional")
        info_parts = []
        for entry in specs.find_all("li"):
            label = entry.find("span", class_="label").text
            value = entry.find("span", class_="data").text
            upper_label = label.upper()
            if "SKU" in upper_label or "EAN" in upper_label:
                continue
            info_parts.append(f'{label}: {value}')
        info = ' / '.join(info_parts)

        # The last list item mentioning "EAN" supplies the code.
        ean_code = ''
        for entry in view.find_all("li"):
            if entry is not None and "EAN" in entry.text.upper():
                ean_code = entry.find("span", class_="data").text

        product_details = {
            'title': title,
            'url': prod_url,
            'sku_code': sku,
            'brand': brand,
            'description': description,
            'image_url': image,
            'ean_code': ean_code,
            'info': info,
        }
        price_details = {'price': price, "promo": promo, "promo_end": promo_end}

        common.add_product(kafka_producer, config['fun']['name'], product_details, price_details)
    except Exception as err:
        common.dump_failed_product(config['fun']['name'], prod_url, soup, err, traceback.format_exc())
|
||||
|
||||
|
||||
# Intended cap on concurrent worker threads.
# NOTE(review): `sema` is created but nothing in this block acquires it, so
# the cap does not appear to be enforced - confirm.
maxthreads = 10
sema = threading.Semaphore(value=maxthreads)
# Every worker thread started by the crawl, joined before the final update.
threads = list()


if __name__ == "__main__":
    config = common.get_config()

    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['fun']['name'], 'url': config['fun']['url'], 'image_url': config['fun']['logo']})
    common.clear_failed_product(config['fun']['name'])

    all_categories = get_categories(config['fun']['categories_url'])

    # `all_categories` remembers everything ever seen; `process_categories`
    # is the queue of category paths still to visit.
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        cat_url = config["fun"]["url"] + url

        soup = common.get_soup_page_no_limit(cat_url)
        if soup is None:
            # The fetch helper can return None; skip this page and move on
            # rather than crashing the whole crawl.
            url = next_url(process_categories)
            continue

        subcat = soup.find_all("li", class_="item last")

        if len(subcat) == 0:
            # No product entries here: queue any category links found inside
            # <big> elements instead.
            for big in soup.find_all("big"):
                b_href = big.find("a")
                if b_href:
                    try:
                        b_url = b_href['href'].replace(config["fun"]["url"], "")
                        if b_url not in all_categories.values():
                            process_categories[b_href.text] = b_url
                            all_categories[b_href.text] = b_url
                    except Exception as err:
                        print(url)
                        print("+++++++++++++")
                        print(cat_url)
                        print("+++++++++++++")
                        print(b_href)
                        print("+++++++++++++")
                        print(err)
                        print("=============")
            url = next_url(process_categories)
        else:
            for sc in subcat:
                product = sc.find("h2", class_="product-name")
                p_info = product.find("a")
                if p_info['href'] not in products.values():
                    products[p_info['title']] = p_info['href']

                    thread = threading.Thread(target=get_product_details, args=(p_info['href'], config, producer))
                    threads.append(thread)
                    thread.start()

            # Follow pagination inside the category before taking the next
            # queued category.
            next_page = soup.find("a", class_="next i-next")
            if next_page is None:
                url = next_url(process_categories)
            else:
                url = next_page['href'].replace(config["fun"]["url"], "")

    for t in threads:
        t.join()

    common.update_store_prices(producer, config['fun'])
|
||||
@@ -0,0 +1,95 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import requests
|
||||
import logging
|
||||
import xml.etree.ElementTree as ET
|
||||
import json
|
||||
import unidecode
|
||||
import sqlite3
|
||||
import common
|
||||
import threading
|
||||
import re
|
||||
import traceback
|
||||
import sys
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime
|
||||
from kafka import KafkaProducer
|
||||
|
||||
|
||||
def get_product_details(prod_url, config, kafka_producer, json_product):
    """Combine a Hubo catalogue record with its product page and publish it.

    Title, SKU, price and the optional image, brand and discount flag come
    from *json_product*; the description paragraph and EAN code are read from
    the product page when present.

    Args:
        prod_url: URL of the product detail page.
        config: Parsed configuration; ``config['hubo']['name']`` is used as
            the store name.
        kafka_producer: Producer object handed through to ``common.add_product``.
        json_product: dict for this product; ``title``, ``sku`` and ``price``
            are required, ``image``, ``brand`` and ``discount`` are optional.

    Returns:
        None.  Any exception is caught and reported through
        ``common.dump_failed_product`` together with the page soup (``None``
        when the fetch itself failed) and the traceback.
    """
    # Bound before the try block so the except handler can always reference
    # it, even when the fetch itself raises.
    soup = None
    try:
        soup = common.get_soup_page_no_limit(prod_url)

        description = soup.find('div', class_='pdp-description__content')
        ean_rows = soup.find_all('div', class_='row border-bottom')

        product = {}
        price = {}
        product['title'] = json_product['title']
        product['url'] = prod_url
        if 'image' in json_product:
            product['image_url'] = json_product['image']

        # The description is optional: a page without the block (or without
        # a paragraph in it) simply yields no 'info' entry.
        paragraph = description.find("p") if description is not None else None
        if paragraph is not None:
            product['info'] = paragraph.text

        product['sku_code'] = json_product['sku']

        ean_pattern = re.compile(r"([0-9]{8,13})")
        for row in ean_rows:
            if 'EAN- / barcode' in row.text:
                ean_match = ean_pattern.search(row.text)
                # Keep the product even when the row holds no 8-13 digit run.
                if ean_match is not None:
                    product['ean_code'] = ean_match.group(1)

        if 'brand' in json_product:
            product['brand'] = json_product['brand']

        price['price'] = json_product['price']
        price['promo'] = 1 if 'discount' in json_product else 0

        common.add_product(kafka_producer, config['hubo']['name'], product, price)
    except Exception as err:
        common.dump_failed_product(config['hubo']['name'], prod_url, soup, err, traceback.format_exc())
|
||||
|
||||
|
||||
def next_url(url_list):
    """Pop the first entry of *url_list* and return its URL.

    Args:
        url_list: dict of pending URLs; the returned entry is removed from it.

    Returns:
        The URL stored under the first key in iteration order, or ``None``
        once the dict is empty.
    """
    try:
        first_key = next(iter(url_list))
    except StopIteration:
        # Nothing left to hand out.
        return None
    return url_list.pop(first_key)
|
||||
|
||||
|
||||
# Thread bookkeeping shared by the crawl below.
# NOTE(review): `sema` is created but nothing in this block acquires it - confirm
# whether the cap is meant to be enforced.
maxthreads = 5
sema = threading.Semaphore(value=maxthreads)
threads = []


def _load_product_page(page_url):
    """Download *page_url* and parse its ASCII-folded body as JSON."""
    response = requests.get(page_url)
    return json.loads(unidecode.unidecode(response.content.decode('utf-8', 'ignore')))


if __name__ == "__main__":
    config = common.get_config()
    hubo = config['hubo']

    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': hubo['name'], 'url': hubo['url'], 'image_url': hubo['logo']})
    common.clear_failed_product(hubo['name'])

    data_json = _load_product_page(hubo['products_url'])
    offset = 1

    # Keep paging until the service answers with an empty 'docs' list.
    while data_json['docs']:
        print(f'{offset}/{data_json["doc_count"]}')
        for product in data_json['docs']:
            worker = threading.Thread(
                target=get_product_details,
                args=(f'{config["hubo"]["url"]}{product["url"]}', config, producer, product),
            )
            threads.append(worker)
            worker.start()

        # Advance by the page size reported in the response itself.
        offset += int(data_json['limit'])
        data_json = _load_product_page(hubo['products_url_offset'].replace('__OFFSET__', str(offset)))

    for worker in threads:
        worker.join()

    common.update_store_prices(producer, config['hubo'])
|
||||
Reference in New Issue
Block a user