#!/usr/bin/env python3
|
|
|
|
import requests
|
|
import logging
|
|
import xml.etree.ElementTree as ET
|
|
import json
|
|
import unidecode
|
|
import sqlite3
|
|
import common
|
|
import threading
|
|
import re
|
|
import traceback
|
|
import sys
|
|
import itertools
|
|
import logging
|
|
#logging.basicConfig(level=logging.DEBUG)
|
|
|
|
from time import sleep
|
|
from random import randint
|
|
from bs4 import BeautifulSoup
|
|
from datetime import datetime
|
|
from kafka import KafkaProducer
|
|
from ratelimit import limits, sleep_and_retry
|
|
|
|
|
|
|
|
def get_categories(url):
    """Scrape the catalogue overview page and return {category title: href}.

    Titles on the page look like "Category | Site name"; only the part
    before the separator is kept as the key.
    """
    soup = common.get_soup_page(url)

    result = {}
    for item in soup.find_all('article', class_="item bg-gray"):
        link = item.find('a')
        title = link['title'].split(' | ')[0]
        result[title] = link['href']
    return result
|
|
|
|
|
|
def get_product_details(prod_url, config, kafka_producer, anchor_title):
    """Fetch a single product page, extract its details and publish them.

    On success the product dict and price dict are handed to
    common.add_product (which publishes via the Kafka producer).  Any
    failure — missing page, missing markup, unexpected exception — is
    recorded with common.dump_failed_product; the function never raises.

    :param prod_url: absolute URL of the product page
    :param config: parsed configuration (uses the 'brouwland' section)
    :param kafka_producer: KafkaProducer passed through to common.add_product
    :param anchor_title: category/listing title (currently unused here)
    """
    # Initialize before the try so the except handler can always reference
    # soup — previously a raised get_soup_page would cause a NameError here.
    soup = None
    try:
        soup = common.get_soup_page(prod_url)
        if soup is None:
            common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None)
            return

        prod_detail = soup.find('article')
        if prod_detail is None:
            common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None)
            return

        title = prod_detail.find('h1', {'itemprop': 'name'})
        # Guard the chained lookup: the image anchor may be absent, in which
        # case .find('img') on None would raise AttributeError.
        image_anchor = prod_detail.find('a', class_='image-click')
        image = image_anchor.find('img') if image_anchor is not None else None
        description = prod_detail.find('div', {'itemprop': 'description'})
        ean_code = prod_detail.find('td', {'itemprop': 'eancode'})
        sku = prod_detail.find('span', {'itemprop': 'sku'})
        brand = prod_detail.find('span', {'itemprop': 'brand'})
        price_detail = prod_detail.find('span', {'itemprop': 'price'})

        product = {}
        price = {}
        if title is not None:
            product['title'] = title.text.strip()
        product['url'] = prod_url
        if image is not None:
            product['image_url'] = image['src']
        if description is not None:
            product['info'] = description.text.strip()
        if sku is not None:
            product['sku_code'] = sku.text.strip()
        if ean_code is not None:
            product['ean_code'] = ean_code.text.strip()
        if brand is not None:
            product['brand'] = brand.text.strip()

        # A product without a price cannot be published — record it as a
        # failure with a clear reason instead of crashing on None.text.
        if price_detail is None:
            common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None)
            return
        # Price text looks like "<currency> <amount>"; keep the amount.
        price['price'] = price_detail.text.split()[1]
        # A <del> element on the page marks a struck-through old price.
        price['promo'] = 0 if soup.find("del") is None else 1

        common.add_product(kafka_producer, config['brouwland']['name'], product, price)
    except Exception as err:
        print(traceback.format_exc())
        common.dump_failed_product(config['brouwland']['name'], prod_url, soup, err, traceback.format_exc())
|
|
|
|
|
|
def next_url(url_list):
    """Pop the first entry of *url_list* (a dict) and return its value.

    Mutates url_list in place; returns None once the dict is empty.
    Relies on dicts preserving insertion order, so entries are consumed
    first-in first-out.
    """
    try:
        key, url = next(iter(url_list.items()))
    except StopIteration:
        # Dict exhausted — signal the caller to stop.
        return None
    del url_list[key]
    return url
|
|
|
|
|
|
# Intended cap on concurrent product-detail fetch threads.
maxthreads = 5
# NOTE(review): this semaphore is created but never acquired/released
# anywhere in this file, so the number of live threads is effectively
# unbounded — confirm whether get_product_details was meant to use it.
sema = threading.Semaphore(value=maxthreads)
# Worker threads spawned while crawling; joined before the final price update.
threads = list()
|
|
|
|
if __name__ == "__main__" :
    config = common.get_config()

    # NOTE: 'kafka_boostrap_servers' (sic) is the actual config key — do not
    # "fix" the spelling here without also changing the config file.
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    # Register/refresh the store record, then clear failures from prior runs.
    common.insert_update_store(producer, {'store': config['brouwland']['name'], 'url': config['brouwland']['url'], 'image_url': config['brouwland']['logo']})
    common.clear_failed_product(config['brouwland']['name'])

    all_categories = get_categories(config['brouwland']['catalogue_url'])

    # process_categories is the work queue (consumed by next_url);
    # all_categories keeps every href ever seen, for de-duplication.
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    i = 0
    while url is not None:
        i = i + 1
        # Progress: categories started / total known / still queued.
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        # Category hrefs are relative; prefix with the store base URL.
        url = f'{config["brouwland"]["url"]}{url}'
        soup = common.get_soup_page(url)
        if soup is None:
            # Page fetch failed — skip to the next queued category.
            url = next_url(process_categories)
            continue
        page_products = soup.find_all("article", class_="product")
        # Category pages can themselves list sub-categories.
        extra_cat = soup.find_all("article", class_="item bg-gray")

        if len(extra_cat) > 0:
            for cat in extra_cat:
                anchor = cat.find('a')
                # Only enqueue sub-categories we have not seen before.
                if anchor['href'] not in all_categories.values():
                    anchor_title = anchor['title'].split(' | ')[0]
                    process_categories[anchor_title] = f'{anchor["href"]}'
                    all_categories[anchor_title] = anchor['href']
                    #print(f'added {cat["data-href"]}')

        for product in page_products:
            anchor = product.find("a")
            # Skip products already dispatched (same href seen earlier).
            if anchor['href'] not in products.values():
                anchor_title = anchor['title'].split(' | ')[0]
                # Skip gift vouchers ("cadeau" = gift in Dutch).
                if anchor_title.upper().find("CADEAU") > -1:
                    continue
                products[anchor_title] = anchor["href"]

                # One thread per product; NOTE(review): nothing throttles
                # thread creation here (the module-level semaphore is unused).
                thread = threading.Thread(target=get_product_details,args=(f'{config["brouwland"]["url"]}{anchor["href"]}', config, producer, anchor_title))
                threads.append(thread)
                thread.start()

        # Follow pagination within the category; fall back to the queue
        # when there is no "next" link.
        next_page = soup.find("a", class_="next")
        if next_page is None:
            url = next_url(process_categories)
        else:
            url = next_page["href"]

    # Wait for all product-detail workers before the final price sync.
    for t in threads:
        t.join()

    common.update_store_prices(producer, config['brouwland'])
|