#!/usr/bin/env python3

import threading
import traceback
import xml.etree.ElementTree as ET

import requests
from kafka import KafkaProducer

import common
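
# `common` is a project-local helper module. Based on how it is used below, it
# is assumed to provide: get_config(), get_soup_page_no_limit(url) (returning a
# BeautifulSoup document or None), insert_update_store(), clear_failed_product(),
# add_product(), dump_failed_product() and update_store_prices().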


def get_categories(cat_url, config):
    """Fetch the category menu and return a {name: relative URL} mapping."""
    page = requests.get(cat_url)
    # The endpoint returns a JSON-encoded HTML fragment: strip the surrounding
    # quotes and undo the JSON escape sequences before parsing it as XML.
    content = page.content[1:-1].decode('utf-8')
    content = content.replace("\\r\\n", "")
    content = content.replace('\\/', '/')
    content = content.replace('\\"', '"')
    # Escape bare ampersands, which would otherwise make ET.fromstring() fail.
    content = content.replace(' & ', ' &amp; ')
    root = ET.fromstring(content)

    categories = {}
    for elm in root.iter("a"):
        if elm.attrib.get('href', '#') == '#' or elm.text is None:
            continue
        # Skip gift-card ("cadeau") and "show all" ("volledig") links.
        if "CADEAU" in elm.text.upper() or "VOLLEDIG" in elm.text.upper():
            continue
        href = elm.attrib['href'].replace(config["fun"]["url"], "")
        if href not in categories.values():
            categories[elm.text] = href
    return categories
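
# Example of the returned shape (names and paths are illustrative, not taken
# from the live site):
#   {'Lego': 'speelgoed/lego.html', ...}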


def next_url(url_list):
    """Pop the first entry from the pending-URL dict; None when exhausted."""
    if not url_list:
        return None
    key = next(iter(url_list))
    return url_list.pop(key)
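
# Product pages are assumed to carry schema.org microdata (the itemprop
# attributes queried below); if the markup changes, the except branch dumps
# the page via common.dump_failed_product() for inspection.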
def get_product_details(prod_url, config, kafka_producer):
    """Scrape one product page and publish its details and price to Kafka."""
    # The semaphore caps how many scraper threads run concurrently.
    with sema:
        soup = common.get_soup_page_no_limit(prod_url)
        if soup is None:
            return

        try:
            prod_view = soup.find("div", class_="product-view")
            prod_essential = prod_view.find("div", class_="product-essential")

            image = prod_view.find("img", {'itemprop': 'image'})['src']
            price = prod_essential.find("meta", {'itemprop': 'price'})['content']

            # A "special-price" block marks a promotion; its end date is optional.
            special_price = prod_essential.find("p", class_="special-price")
            promo = False
            promo_end = None
            if special_price is not None:
                promo = True
                promo_end = prod_essential.find("meta", {'itemprop': "priceValidUntil"})
                if promo_end is not None:
                    promo_end = promo_end['content']

            title = prod_view.find("h1", {'itemprop': 'name'}).text
            sku = prod_view.find("meta", {'itemprop': 'sku'})['content']
            brand = prod_view.find("meta", {'itemprop': 'brand'})['content']
            description = prod_view.find("meta", {'itemprop': 'description'})['content']

            # Flatten the "additional information" tab into "label: value / ..."
            # pairs, skipping SKU and EAN rows (those are captured separately).
            specs = prod_view.find("div", class_="tab-content tab-block-additional")
            info = ''
            for spec in specs.find_all("li"):
                label = spec.find("span", class_="label").text
                content = spec.find("span", class_="data").text
                if "SKU" not in label.upper() and "EAN" not in label.upper():
                    info = f'{info}{label}: {content} / '
            info = info[:-3]  # drop the trailing " / " separator

            ean_code = ''
            for elm in prod_view.find_all("li"):
                if "EAN" in elm.text.upper():
                    data = elm.find("span", class_="data")
                    if data is not None:
                        ean_code = data.text

            product_details = {
                'title': title,
                'url': prod_url,
                'sku_code': sku,
                'brand': brand,
                'description': description,
                'image_url': image,
                'ean_code': ean_code,
                'info': info,
            }
            price_details = {'price': price, 'promo': promo, 'promo_end': promo_end}

            common.add_product(kafka_producer, config['fun']['name'],
                               product_details, price_details)
        except Exception as err:
            # Keep the failed page for later inspection instead of killing the thread.
            common.dump_failed_product(config['fun']['name'], prod_url, soup, err,
                                       traceback.format_exc())

MAX_THREADS = 10
sema = threading.Semaphore(value=MAX_THREADS)
threads = []

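# Crawl strategy: walk the category menu breadth-first, follow any sub-category
# links found on listing-free pages, paginate through each listing, and spawn
# one thread per newly seen product page.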
if __name__ == "__main__":
    config = common.get_config()

    # Note: 'kafka_boostrap_servers' (sic) is assumed to match the key spelling
    # in the config file.
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['fun']['name'],
                                          'url': config['fun']['url'],
                                          'image_url': config['fun']['logo']})
    common.clear_failed_product(config['fun']['name'])

    all_categories = get_categories(config['fun']['categories_url'], config)

    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    i = 0
    while url is not None:
        i = i + 1
        # Progress: categories processed / total known / still queued.
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        cat_url = config["fun"]["url"] + url

        soup = common.get_soup_page_no_limit(cat_url)
        if soup is None:
            url = next_url(process_categories)
            continue
        subcat = soup.find_all("li", class_="item last")

        if len(subcat) == 0:
            # No product listing here: treat the page as an intermediate category
            # and queue any sub-category links (rendered inside <big> tags).
            big_urls = soup.find_all("big")
            for b_url in big_urls:
                b_href = b_url.find("a")
                if b_href:
                    try:
                        b_path = b_href['href'].replace(config["fun"]["url"], "")
                        if b_path not in all_categories.values():
                            process_categories[b_href.text] = b_path
                            all_categories[b_href.text] = b_path
                    except Exception as err:
                        print(f'Failed to queue sub-category {b_href} on {cat_url} (from {url})')
                        print(err)
            url = next_url(process_categories)
        else:
            for sc in subcat:
                product = sc.find("h2", class_="product-name")
                p_info = product.find("a")
                if p_info['href'] not in products.values():
                    products[p_info['title']] = p_info['href']

                    # One thread per product; the semaphore inside
                    # get_product_details() caps the real concurrency.
                    thread = threading.Thread(target=get_product_details,
                                              args=(p_info['href'], config, producer))
                    threads.append(thread)
                    thread.start()

            # Follow pagination within the category before moving to the next one.
            next_page = soup.find("a", class_="next i-next")
            if next_page is None:
                url = next_url(process_categories)
            else:
                url = next_page['href'].replace(config["fun"]["url"], "")

    # Wait for all scraper threads before publishing the price update.
    for t in threads:
        t.join()

    common.update_store_prices(producer, config['fun'])