|
|
- #!/usr/bin/env python3
-
- import requests
- import logging
- import xml.etree.ElementTree as ET
- import json
- import unidecode
- import sqlite3
- import common
- import threading
- import traceback
-
- from bs4 import BeautifulSoup
- from datetime import datetime
- from kafka import KafkaProducer
-
-
def get_categories(cat_url, base_url=None):
    """Fetch the category-menu endpoint and return {link text: relative URL}.

    The endpoint returns a JSON-escaped HTML fragment (a quoted string with
    escaped quotes and slashes), so the payload is manually unescaped before
    being parsed as XML.

    Args:
        cat_url: absolute URL of the category-menu endpoint.
        base_url: store base URL stripped from each link to make it relative.
            Defaults to config["fun"]["url"] (module-global, set in __main__)
            for backward compatibility.

    Returns:
        dict mapping category link text to its URL relative to base_url.
        '#' anchors, text-less links, gift ("CADEAU") / full-assortment
        ("VOLLEDIG") links, and duplicate targets are skipped.
    """
    if base_url is None:
        # Backward-compatible fallback to the original module-global lookup.
        base_url = config["fun"]["url"]

    page = requests.get(cat_url)
    # Strip the surrounding JSON quotes and undo the escaping so the payload
    # becomes plain markup.
    content = page.content[1:-1].decode('utf-8')
    content = content.replace("\\r\\n", "")
    content = content.replace('\\/', '/')
    content = content.replace('\\"', '"')
    # A bare ' & ' is not well-formed XML; collapse it so ET accepts the text.
    content = content.replace(' & ', '&')
    root = ET.fromstring(content)

    categories = {}
    for elm in root.iter("a"):
        # .get hardens against <a> tags without an href (original raised KeyError).
        href = elm.attrib.get('href', '#')
        if href == '#' or elm.text is None:
            continue
        text_upper = elm.text.upper()
        # Skip gift-card ("CADEAU") and full-assortment ("VOLLEDIG") links.
        if "CADEAU" in text_upper or "VOLLEDIG" in text_upper:
            continue
        if href not in categories.values():
            categories[elm.text] = href.replace(base_url, "")
    return categories
-
-
def next_url(url_list):
    """Pop the first entry from url_list (a dict) and return its URL value.

    Mutates url_list in place: the first (insertion-order) key is removed.

    Args:
        url_list: dict mapping a display name to a URL; acts as a FIFO work
            queue thanks to dict insertion ordering (Python 3.7+).

    Returns:
        The URL of the first entry, or None when the dict is empty.
    """
    if not url_list:
        return None
    # First key in insertion order; pop() both reads and removes it.
    first_key = next(iter(url_list))
    return url_list.pop(first_key)
-
-
def get_product_details(prod_url, config, kafka_producer):
    """Scrape one product page and publish its details and price to Kafka.

    Runs as a worker-thread target. Any scraping error (missing element,
    unexpected markup, ...) is recorded via common.dump_failed_product instead
    of propagating, so one bad page never kills the crawl.

    Args:
        prod_url: absolute URL of the product page.
        config: parsed configuration; config['fun']['name'] identifies the store.
        kafka_producer: KafkaProducer handed to common.add_product.
    """
    # Fix: the module-level semaphore `sema` was created (maxthreads=10) but
    # never acquired, so the number of concurrently running fetches was
    # effectively unbounded. Acquiring it here enforces the intended cap.
    with sema:
        soup = common.get_soup_page_no_limit(prod_url)
        if soup is None:
            return

        try:
            prod_view = soup.find("div", class_="product-view")
            prod_essential = prod_view.find("div", class_="product-essential")

            image = prod_view.find("img", {'itemprop': 'image'})['src']
            price = prod_essential.find("meta", {'itemprop': 'price'})['content']

            # A "special-price" paragraph marks a promotion; its optional end
            # date is published in the priceValidUntil meta tag.
            special_price = prod_essential.find("p", class_="special-price")
            promo = False
            promo_end = None
            if special_price is not None:
                promo = True
                promo_end = prod_essential.find("meta", {'itemprop': "priceValidUntil"})
                if promo_end is not None:
                    promo_end = promo_end['content']

            title = prod_view.find("h1", {'itemprop': 'name'}).text
            sku = prod_view.find("meta", {'itemprop': 'sku'})['content']
            brand = prod_view.find("meta", {'itemprop': 'brand'})['content']
            description = prod_view.find("meta", {'itemprop': 'description'})['content']

            # Build the "additional specs" tab as 'label: value / ...' pairs,
            # excluding SKU/EAN rows (stored separately below).
            specs = prod_view.find("div", class_="tab-content tab-block-additional")
            spec_parts = []
            for spec in specs.find_all("li"):
                label = spec.find("span", class_="label").text
                value = spec.find("span", class_="data").text
                if "SKU" not in label.upper() and "EAN" not in label.upper():
                    spec_parts.append(f'{label}: {value}')
            info = ' / '.join(spec_parts)

            # The EAN code sits in one of the page's <li> rows; the last match wins.
            ean_code = ''
            for elm in prod_view.find_all("li"):
                if "EAN" in elm.text.upper():
                    ean_code = elm.find("span", class_="data").text

            product_details = {'title': title, 'url': prod_url, 'sku_code': sku, 'brand': brand, 'description': description, 'image_url': image, 'ean_code': ean_code, 'info': info}
            price_details = {'price': price, "promo": promo, "promo_end": promo_end}

            common.add_product(kafka_producer, config['fun']['name'], product_details, price_details)
        except Exception as err:
            # Broad by design: dump the failing page for offline inspection
            # rather than aborting the whole crawl.
            common.dump_failed_product(config['fun']['name'], prod_url, soup, err, traceback.format_exc())
-
-
# Concurrency controls for the per-product worker threads.
maxthreads = 10  # intended cap on concurrent product-detail fetches
sema = threading.Semaphore(value=maxthreads)  # semaphore meant to enforce maxthreads
threads = list()  # every spawned worker; all are joined before the final price update
-
if __name__ == "__main__":
    config = common.get_config()

    # Register/refresh the store record and clear stale failure dumps before
    # starting a fresh crawl.
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['fun']['name'], 'url': config['fun']['url'], 'image_url': config['fun']['logo']})
    common.clear_failed_product(config['fun']['name'])

    all_categories = get_categories(config['fun']['categories_url'])

    # process_categories is the work queue (next_url pops from its front);
    # all_categories keeps every URL ever discovered so no page is queued twice.
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    i = 0
    while url is not None:
        i = i + 1
        # Progress line: pages processed / total discovered - still queued.
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        cat_url = config["fun"]["url"] + url

        # NOTE(review): get_soup_page_no_limit can return None (see the guard
        # in get_product_details); that would raise AttributeError here — confirm.
        soup = common.get_soup_page_no_limit(cat_url)
        subcat = soup.find_all("li", class_="item last")

        if len(subcat) == 0:
            # No product tiles on this page: treat it as a hub page and queue
            # the subcategory links found inside <big> elements.
            big_urls = soup.find_all("big")
            for b_url in big_urls:
                b_href = b_url.find("a")
                if b_href:
                    try:
                        b_url = b_href['href'].replace(config["fun"]["url"], "")
                        if b_url not in all_categories.values():
                            process_categories[b_href.text] = b_url
                            all_categories[b_href.text] = b_url
                    except Exception as err:
                        # Best-effort: print the offending link context and
                        # keep crawling.
                        print(url)
                        print("+++++++++++++")
                        print(cat_url)
                        print("+++++++++++++")
                        print(b_href)
                        print("+++++++++++++")
                        print(err)
                        print("=============")
            url = next_url(process_categories)
        else:
            # Product listing page: spawn one worker thread per new product.
            for sc in subcat:
                product = sc.find("h2", class_="product-name")
                p_info = product.find("a")
                if p_info['href'] not in products.values():
                    products[p_info['title']] = p_info['href']

                    thread = threading.Thread(target=get_product_details,args=(p_info['href'], config, producer))
                    threads.append(thread)
                    thread.start()

            # Follow pagination if present; otherwise take the next category
            # from the queue.
            next_page = soup.find("a", class_="next i-next")
            if next_page is None:
                url = next_url(process_categories)
            else:
                url = next_page['href'].replace(config["fun"]["url"], "")

    # Wait for every worker before marking unseen products' prices stale.
    for t in threads:
        t.join()

    common.update_store_prices(producer, config['fun'])
|