#!/usr/bin/env python3
import requests
import threading
import traceback
import xml.etree.ElementTree as ET

import common
from kafka import KafkaProducer
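
# Scraper for the "fun" store: crawls the category tree, visits every product
# page and publishes product and price details to Kafka. The helpers used here
# (get_soup_page_no_limit, add_product, dump_failed_product, ...) live in the
# local `common` module; their behavior is inferred from how they are called.
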
def get_categories(cat_url):
    """Fetch the category menu and return a {name: relative URL} mapping."""
    page = requests.get(cat_url)
    # The endpoint returns a JSON-escaped HTML fragment: strip the surrounding
    # quotes, undo the escaping, and escape bare ampersands so ElementTree
    # accepts the fragment as XML.
    content = page.content[1:-1].decode('utf-8')
    content = content.replace("\\r\\n", "")
    content = content.replace('\\/', '/')
    content = content.replace('\\"', '"')
    content = content.replace(' & ', '&amp;')
    root = ET.fromstring(content)
    categories = {}
    for elm in root.iter():
        if elm.tag != "a":
            continue
        if elm.attrib['href'] == '#' or elm.text is None:
            continue
        # Skip gift-card ("CADEAU") and catch-all ("VOLLEDIG") menu entries.
        if "CADEAU" in elm.text.upper() or "VOLLEDIG" in elm.text.upper():
            continue
        if elm.attrib['href'] not in categories.values():
            # `config` is the module-level dict set in the __main__ block.
            categories[elm.text] = elm.attrib['href'].replace(config["fun"]["url"], "")
    return categories


def next_url(url_list):
    """Pop the first pending URL, or None when the dict is empty.

    Dicts preserve insertion order (Python 3.7+), so this consumes FIFO.
    """
    if len(url_list) == 0:
        return None
    key = next(iter(url_list))
    url = url_list[key]
    del url_list[key]
    return url
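

# Worker executed in a thread per product page; the semaphore caps how many
# scrapes run at once.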
def get_product_details(prod_url, config, kafka_producer):
    """Scrape a single product page and publish its details and price to Kafka."""
    with sema:  # limit concurrent product scrapes to maxthreads
        soup = common.get_soup_page_no_limit(prod_url)
        if soup is None:
            return
        try:
            prod_view = soup.find("div", class_="product-view")
            prod_essential = prod_view.find("div", class_="product-essential")
            image = prod_view.find("img", {'itemprop': 'image'})['src']
            price = prod_essential.find("meta", {'itemprop': 'price'})['content']
            special_price = prod_essential.find("p", class_="special-price")
            promo = False
            promo_end = None
            if special_price is not None:
                promo = True
                promo_end = prod_essential.find("meta", {'itemprop': "priceValidUntil"})
                if promo_end is not None:
                    promo_end = promo_end['content']
            title = prod_view.find("h1", {'itemprop': 'name'}).text
            sku = prod_view.find("meta", {'itemprop': 'sku'})['content']
            brand = prod_view.find("meta", {'itemprop': 'brand'})['content']
            description = prod_view.find("meta", {'itemprop': 'description'})['content']
            # Collect the "additional information" specs, skipping SKU/EAN rows.
            specs = prod_view.find("div", class_="tab-content tab-block-additional")
            spec_li = specs.find_all("li")
            info = ''
            for spec in spec_li:
                label = spec.find("span", class_="label").text
                content = spec.find("span", class_="data").text
                if "SKU" not in label.upper() and "EAN" not in label.upper():
                    info = f'{info}{label}: {content} / '
            info = info[:-3]  # drop the trailing " / " separator
            ean_code = ''
            for elm in prod_view.find_all("li"):
                if "EAN" in elm.text.upper():
                    ean_code = elm.find("span", class_="data").text
            product_details = {'title': title, 'url': prod_url, 'sku_code': sku,
                               'brand': brand, 'description': description,
                               'image_url': image, 'ean_code': ean_code, 'info': info}
            price_details = {'price': price, "promo": promo, "promo_end": promo_end}
            common.add_product(kafka_producer, config['fun']['name'], product_details, price_details)
        except Exception as err:
            common.dump_failed_product(config['fun']['name'], prod_url, soup, err,
                                       traceback.format_exc())
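

# Shared concurrency controls for the product-scraping threads.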
maxthreads = 10
sema = threading.Semaphore(value=maxthreads)
threads = list()
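

# Crawl strategy: breadth-first over the category tree. Pages without product
# listings contribute new sub-category links to the queue; listing pages spawn
# one scraper thread per product and follow their own pagination.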
if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['fun']['name'],
                                          'url': config['fun']['url'],
                                          'image_url': config['fun']['logo']})
    common.clear_failed_product(config['fun']['name'])
    all_categories = get_categories(config['fun']['categories_url'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        cat_url = config["fun"]["url"] + url
        soup = common.get_soup_page_no_limit(cat_url)
        subcat = soup.find_all("li", class_="item last")
        if len(subcat) == 0:
            # No product listing here: treat the page as an intermediate
            # category and queue any sub-category links found under <big> tags.
            for big in soup.find_all("big"):
                b_href = big.find("a")
                if b_href:
                    try:
                        b_url = b_href['href'].replace(config["fun"]["url"], "")
                        if b_url not in all_categories.values():
                            process_categories[b_href.text] = b_url
                            all_categories[b_href.text] = b_url
                    except Exception as err:
                        print(url)
                        print("+++++++++++++")
                        print(cat_url)
                        print("+++++++++++++")
                        print(b_href)
                        print("+++++++++++++")
                        print(err)
                        print("=============")
            url = next_url(process_categories)
        else:
            # Listing page: scrape each previously unseen product in a thread.
            for sc in subcat:
                product = sc.find("h2", class_="product-name")
                p_info = product.find("a")
                if p_info['href'] not in products.values():
                    products[p_info['title']] = p_info['href']
                    thread = threading.Thread(target=get_product_details,
                                              args=(p_info['href'], config, producer))
                    threads.append(thread)
                    thread.start()
            # Follow pagination within the category before moving on.
            next_page = soup.find("a", class_="next i-next")
            if next_page is None:
                url = next_url(process_categories)
            else:
                url = next_page['href'].replace(config["fun"]["url"], "")

    for t in threads:
        t.join()
    common.update_store_prices(producer, config['fun'])