|
|
- #!/usr/bin/env python3
-
- import requests
- import logging
- import xml.etree.ElementTree as ET
- import json
- import unidecode
- import sqlite3
- import common
- import threading
- import re
- import traceback
- import sys
-
- from bs4 import BeautifulSoup
- from datetime import datetime
- from kafka import KafkaProducer
-
-
# First run of 8 to 13 consecutive digits in the barcode row (EAN-8 .. EAN-13
# lengths). Compiled once because this runs for every scraped product.
_EAN_PATTERN = re.compile(r"([0-9]{8,13})")


def get_product_details(prod_url, config, kafka_producer, json_product):
    """Scrape one product page and publish the product together with its price.

    Most fields come from ``json_product`` (the catalogue listing entry); the
    product page is only used for the description text and the EAN barcode.

    Args:
        prod_url: Absolute URL of the product page to scrape.
        config: Parsed configuration; ``config['hubo']['name']`` is used as
            the store name.
        kafka_producer: Producer handed through to ``common.add_product``.
        json_product: Listing entry for the product. Must contain ``title``,
            ``sku`` and ``price``; ``image``, ``brand`` and ``discount`` are
            optional.

    Returns:
        None. Any exception is caught and reported through
        ``common.dump_failed_product`` instead of being propagated.
    """
    # Bound before the try block so the except handler can always reference
    # it, even when fetching the page itself is what failed.
    # NOTE(review): dump_failed_product then receives soup=None -- confirm the
    # helper copes with that.
    soup = None
    try:
        soup = common.get_soup_page_no_limit(prod_url)

        description = soup.find('div', class_='pdp-description__content')
        ean_rows = soup.find_all('div', class_='row border-bottom')

        product = {
            'title': json_product['title'],
            'url': prod_url,
        }
        if 'image' in json_product:
            product['image_url'] = json_product['image']

        # The description block and its first paragraph are both optional;
        # a page without them still yields a valid product.
        paragraph = description.find("p") if description is not None else None
        if paragraph is not None:
            product['info'] = paragraph.text

        product['sku_code'] = json_product['sku']

        for row in ean_rows:
            if 'EAN- / barcode' in row.text:
                ean_match = _EAN_PATTERN.search(row.text)
                # A barcode row without a usable digit run should not fail
                # the whole product; just leave ean_code unset.
                if ean_match is not None:
                    product['ean_code'] = ean_match.group(1)

        if 'brand' in json_product:
            product['brand'] = json_product['brand']

        price = {
            'price': json_product['price'],
            # Presence of a 'discount' key marks the price as promotional.
            'promo': 1 if 'discount' in json_product else 0,
        }

        common.add_product(kafka_producer, config['hubo']['name'], product, price)
    except Exception as err:
        # Best-effort scraping: record the failure and let the caller move on.
        common.dump_failed_product(config['hubo']['name'], prod_url, soup, err, traceback.format_exc())
-
-
def next_url(url_list):
    """Remove and return the first URL stored in ``url_list``.

    Args:
        url_list: Dict mapping arbitrary keys to URLs. It is modified in
            place: the entry that is returned is removed from it.

    Returns:
        The value of the first entry in iteration order, or ``None`` when
        ``url_list`` is empty.
    """
    if not url_list:
        return None

    # dict.pop does the lookup and the removal in a single step.
    first_key = next(iter(url_list))
    return url_list.pop(first_key)
-
-
# Upper bound on the number of product pages scraped concurrently.
maxthreads = 5
# Slots are taken by the main thread before a worker starts and given back by
# the worker when it finishes, so at most `maxthreads` workers run at once.
sema = threading.Semaphore(value=maxthreads)
# Every worker thread ever started, kept so they can all be joined at the end.
threads = list()


def _fetch_products_page(url):
    """Download one page of the product listing and return it as parsed JSON."""
    response = requests.get(url)
    # Undecodable bytes are dropped and the text is transliterated to ASCII
    # before parsing, exactly as the listing was handled before.
    return json.loads(unidecode.unidecode(response.content.decode('utf-8', 'ignore')))


def _scrape_and_release(prod_url, config, kafka_producer, json_product):
    """Run get_product_details, then free the semaphore slot taken for it."""
    try:
        get_product_details(prod_url, config, kafka_producer, json_product)
    finally:
        sema.release()


if __name__ == "__main__":
    config = common.get_config()

    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['hubo']['name'], 'url': config['hubo']['url'], 'image_url': config['hubo']['logo']})
    common.clear_failed_product(config['hubo']['name'])

    data_json = _fetch_products_page(config['hubo']['products_url'])

    offset = 1

    # Walk the paginated listing until a page comes back without products.
    while len(data_json['docs']) > 0:
        print(f'{offset}/{data_json["doc_count"]}')
        for product in data_json['docs']:
            # Blocks while `maxthreads` workers are already running, which
            # keeps the number of live threads bounded.
            sema.acquire()
            thread = threading.Thread(target=_scrape_and_release, args=(f'{config["hubo"]["url"]}{product["url"]}', config, producer, product))
            threads.append(thread)
            try:
                thread.start()
            except BaseException:
                # The worker never ran, so it cannot give the slot back itself.
                sema.release()
                raise

        offset = offset + int(data_json['limit'])
        data_json = _fetch_products_page(config['hubo']['products_url_offset'].replace('__OFFSET__', str(offset)))

    # Wait for every outstanding worker before publishing the price update.
    for t in threads:
        t.join()

    common.update_store_prices(producer, config['hubo'])
|