#!/usr/bin/env python3
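"""Scraper for the Hubo web store.

Walks the store's paginated product-listing JSON, spawns a worker thread per
product to scrape the detail page (description, SKU, EAN/barcode, brand,
price/promo flag) and publishes everything to Kafka through the shared
`common` helpers.
"""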
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import re
import traceback
import sys
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer


def get_product_details(prod_url, config, kafka_producer, json_product):
    soup = None
    # Cap concurrent detail-page fetches at `maxthreads` (the module-level
    # semaphore was never acquired in the original; doing so here is the
    # assumed intent).
    with sema:
        try:
            soup = common.get_soup_page_no_limit(prod_url)
            description = soup.find('div', class_='pdp-description__content')
            ean_rows = soup.find_all('div', class_='row border-bottom')
            product = {}
            price = {}
            product['title'] = json_product['title']
            product['url'] = prod_url
            if 'image' in json_product:
                product['image_url'] = json_product['image']
            # Short description paragraph, when the detail page provides one.
            if description is not None and description.find("p") is not None:
                product['info'] = description.find("p").text
            product['sku_code'] = json_product['sku']
            # The EAN/barcode sits in one of the bordered spec rows; pull the
            # first 8-13 digit run out of that row's text.
            for row in ean_rows:
                if row.text.find('EAN- / barcode') > -1:
                    ean_match = re.search("([0-9]{8,13})", row.text)
                    if ean_match is not None:
                        product['ean_code'] = ean_match.group(1)
            if 'brand' in json_product:
                product['brand'] = json_product['brand']
            price['price'] = json_product['price']
            price['promo'] = 1 if 'discount' in json_product else 0
            common.add_product(kafka_producer, config['hubo']['name'], product, price)
        except Exception as err:
            # Keep the failing URL and (possibly None) soup around for inspection.
            common.dump_failed_product(config['hubo']['name'], prod_url, soup, err, traceback.format_exc())

def next_url(url_list):
    """Pop and return the next URL from a dict of pending URLs, or None when empty."""
    if len(url_list) == 0:
        return None
    key = next(iter(url_list))
    url = url_list[key]
    del url_list[key]
    return url

maxthreads = 5
sema = threading.Semaphore(value=maxthreads)
threads = list()

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])

    # Register/refresh the store record and clear failures from previous runs.
    common.insert_update_store(producer, {'store': config['hubo']['name'], 'url': config['hubo']['url'], 'image_url': config['hubo']['logo']})
    common.clear_failed_product(config['hubo']['name'])

    # First page of the product-listing JSON; `unidecode` strips accents so the
    # payload parses cleanly after a lossy UTF-8 decode.
    get_doc = requests.get(config['hubo']['products_url'])
    data_json = json.loads(unidecode.unidecode(get_doc.content.decode('utf-8', 'ignore')))

    offset = 1
    while len(data_json['docs']) > 0:
        print(f'{offset}/{data_json["doc_count"]}')
        # One worker thread per product; the semaphore in get_product_details
        # keeps the number of concurrent fetches bounded.
        for product in data_json['docs']:
            thread = threading.Thread(target=get_product_details,
                                      args=(f'{config["hubo"]["url"]}{product["url"]}', config, producer, product))
            threads.append(thread)
            thread.start()
        # Fetch the next page by substituting the new offset into the paginated URL.
        offset = offset + int(data_json['limit'])
        get_doc = requests.get(config['hubo']['products_url_offset'].replace('__OFFSET__', str(offset)))
        data_json = json.loads(unidecode.unidecode(get_doc.content.decode('utf-8', 'ignore')))

    # Wait for all workers, then push the store-level price update.
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['hubo'])