#!/usr/bin/env python3
|
|
|
|
import requests
|
|
import logging
|
|
import xml.etree.ElementTree as ET
|
|
import json
|
|
import unidecode
|
|
import sqlite3
|
|
import common
|
|
import threading
|
|
import re
|
|
import traceback
|
|
import sys
|
|
|
|
from bs4 import BeautifulSoup
|
|
from datetime import datetime
|
|
from kafka import KafkaProducer
|
|
|
|
|
|
# An EAN/barcode is matched as 8 to 13 consecutive digits; compiled once so
# every worker thread reuses the same pattern object.
_EAN_PATTERN = re.compile(r"([0-9]{8,13})")


def get_product_details(prod_url, config, kafka_producer, json_product):
    """Scrape one product page and publish the product together with its price.

    Args:
        prod_url: Absolute URL of the product page to fetch.
        config: Parsed configuration; only ``config['hubo']['name']`` is read here.
        kafka_producer: Producer object handed through to ``common.add_product``.
        json_product: Listing entry for this product. ``title``, ``sku`` and
            ``price`` are required; ``image``, ``brand`` and ``discount`` are
            optional.

    Returns:
        None. Any exception raised while scraping is caught and reported via
        ``common.dump_failed_product`` instead of being propagated.
    """
    # Bound before the try block so the except handler can always reference
    # it, even when fetching the page is the step that failed.
    # NOTE(review): dump_failed_product then receives soup=None -- confirm it
    # copes with that.
    soup = None
    try:
        soup = common.get_soup_page_no_limit(prod_url)

        description = soup.find('div', class_='pdp-description__content')
        ean_rows = soup.find_all('div', class_='row border-bottom')

        product = {}
        product['title'] = json_product['title']
        product['url'] = prod_url
        if 'image' in json_product:
            product['image_url'] = json_product['image']

        # The description block and its first paragraph are both optional;
        # a page without them must not cause the product to be discarded.
        paragraph = description.find("p") if description is not None else None
        if paragraph is not None:
            product['info'] = paragraph.text

        product['sku_code'] = json_product['sku']

        for row in ean_rows:
            row_text = row.text
            if 'EAN- / barcode' not in row_text:
                continue
            ean_match = _EAN_PATTERN.search(row_text)
            # A labelled row without a usable digit run is skipped rather
            # than failing the whole product.
            if ean_match is not None:
                product['ean_code'] = ean_match.group(1)

        if 'brand' in json_product:
            product['brand'] = json_product['brand']

        price = {}
        price['price'] = json_product['price']
        # The mere presence of a 'discount' key flags the price as promotional.
        price['promo'] = 1 if 'discount' in json_product else 0

        common.add_product(kafka_producer, config['hubo']['name'], product, price)
    except Exception as err:
        # Boundary of a worker thread: record the failure instead of letting
        # the exception escape unnoticed.
        common.dump_failed_product(config['hubo']['name'], prod_url, soup, err, traceback.format_exc())
|
|
|
|
|
|
def next_url(url_list):
    """Remove and return the first value stored in ``url_list``.

    ``url_list`` is used as a mapping: its first key (in iteration order) is
    looked up, the entry is deleted in place, and the associated value is
    returned. ``None`` is returned when the container is empty.
    """
    try:
        first_key = next(iter(url_list))
    except StopIteration:
        # Nothing left to hand out.
        return None

    # pop() fetches the value and deletes the entry in a single step.
    return url_list.pop(first_key)
|
|
|
|
|
|
# Number of permits the worker semaphore below starts with.
maxthreads = 5

# Counting semaphore initialised with `maxthreads` permits.
sema = threading.Semaphore(maxthreads)

# Worker threads started by the script are appended here so that they can
# all be joined before the run finishes.
threads = []
|
|
|
|
if __name__ == "__main__":
    # Script entry point: register the store, walk the paginated product
    # listing, scrape every product page in a worker thread, then trigger the
    # store-wide price update once all workers have finished.

    def _fetch_listing(listing_url):
        """Download one listing page and return it as parsed JSON."""
        response = requests.get(listing_url)
        # Undecodable bytes are dropped and the text is passed through
        # unidecode before it is parsed.
        return json.loads(unidecode.unidecode(response.content.decode('utf-8', 'ignore')))

    def _scrape_then_release(*args):
        """Run get_product_details and always give the semaphore slot back."""
        try:
            get_product_details(*args)
        finally:
            sema.release()

    config = common.get_config()
    hubo = config['hubo']

    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': hubo['name'], 'url': hubo['url'], 'image_url': hubo['logo']})
    common.clear_failed_product(hubo['name'])

    data_json = _fetch_listing(hubo['products_url'])
    offset = 1

    # An empty 'docs' list marks the end of the listing.
    while data_json['docs']:
        print(f'{offset}/{data_json["doc_count"]}')
        for product in data_json['docs']:
            # Wait for a free slot so that at most `maxthreads` workers are
            # alive at once; the worker releases the slot when it ends.
            sema.acquire()
            thread = threading.Thread(
                target=_scrape_then_release,
                args=(f'{hubo["url"]}{product["url"]}', config, producer, product),
            )
            threads.append(thread)
            thread.start()

        # Advance by the page size reported in the response and fetch the
        # next page through the offset URL template.
        offset = offset + int(data_json['limit'])
        data_json = _fetch_listing(hubo['products_url_offset'].replace('__OFFSET__', str(offset)))

    # Every product must be processed before the price update is triggered.
    for t in threads:
        t.join()

    common.update_store_prices(producer, hubo)
|