You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 

153 lines
5.1 KiB

#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import re
import traceback
import sys
import itertools
import logging
#logging.basicConfig(level=logging.DEBUG)
from time import sleep
from random import randint
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer
from ratelimit import limits, sleep_and_retry
def get_categories(url):
    """Scrape the category overview page and map category titles to hrefs.

    Returns a dict {category_title: relative_href}. Returns an empty dict
    when the page could not be fetched.
    """
    soup = common.get_soup_page(url)
    categories = {}
    if soup is None:
        # get_soup_page returns None on fetch failure (the main loop checks
        # for this too); without this guard find_all raises AttributeError.
        return categories
    for article in soup.find_all('article', class_="item bg-gray"):
        anchor = article.find('a')
        # Anchor titles look like "Name | Site suffix"; keep only the name.
        anchor_title = anchor['title'].split(' | ')[0]
        categories[anchor_title] = anchor['href']
    return categories
def get_product_details(prod_url, config, kafka_producer, anchor_title):
    """Fetch one product page and publish its details and price to Kafka.

    All failures (unfetchable page, missing detail markup, parse errors)
    are recorded via common.dump_failed_product instead of raising.
    """
    # Bind soup before the try: if get_soup_page itself raises, the except
    # handler below references soup and would otherwise hit a NameError
    # that masks the original error.
    soup = None
    try:
        soup = common.get_soup_page(prod_url)
        if soup is None:
            common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None)
            return
        prod_detail = soup.find('article')
        if prod_detail is None:
            common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None)
            return
        title = prod_detail.find('h1', {'itemprop': 'name'})
        # Guard the two-step lookup: a missing image anchor previously threw
        # AttributeError and failed the entire product.
        image_anchor = prod_detail.find('a', class_='image-click')
        image = image_anchor.find('img') if image_anchor is not None else None
        description = prod_detail.find('div', {'itemprop': 'description'})
        ean_code = prod_detail.find('td', {'itemprop': 'eancode'})
        sku = prod_detail.find('span', {'itemprop': 'sku'})
        brand = prod_detail.find('span', {'itemprop': 'brand'})
        price_detail = prod_detail.find('span', {'itemprop': 'price'})
        if price_detail is None:
            # Without a price there is nothing useful to publish; record and
            # bail out instead of crashing on the .text access below.
            common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No price found", None)
            return
        product = {}
        price = {}
        if title is not None:
            product['title'] = title.text.strip()
            product['url'] = prod_url
            if image is not None:
                product['image_url'] = image['src']
        if description is not None:
            product['info'] = description.text.strip()
        if sku is not None:
            product['sku_code'] = sku.text.strip()
        if ean_code is not None:
            product['ean_code'] = ean_code.text.strip()
        if brand is not None:
            product['brand'] = brand.text.strip()
        # Price text is assumed to look like "<currency> <amount>", so the
        # second whitespace token is the amount — TODO confirm against the
        # live markup.
        price['price'] = price_detail.text.split()[1]
        # A <del> element marks a struck-through old price, i.e. a promotion.
        price['promo'] = 0 if soup.find("del") is None else 1
        common.add_product(kafka_producer, config['brouwland']['name'], product, price)
    except Exception as err:
        print(traceback.format_exc())
        common.dump_failed_product(config['brouwland']['name'], prod_url, soup, err, traceback.format_exc())
def next_url(url_list):
    """Pop and return the href of the first entry in *url_list*.

    Mutates the dict in place by removing its first insertion-order key.
    Returns None when the dict is empty.
    """
    if not url_list:
        return None
    first_key = next(iter(url_list.keys()))
    return url_list.pop(first_key)
# Intended cap on concurrently running product-detail scraper threads.
maxthreads = 5
# Semaphore sized to that cap, meant to throttle the worker threads.
sema = threading.Semaphore(value=maxthreads)
# Every spawned worker thread; joined at the end of the main block.
threads = list()
if __name__ == "__main__":
    # Crawl every category page (following pagination and sub-categories),
    # spawning one bounded worker thread per discovered product.
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['brouwland']['name'], 'url': config['brouwland']['url'], 'image_url': config['brouwland']['logo']})
    common.clear_failed_product(config['brouwland']['name'])
    all_categories = get_categories(config['brouwland']['catalogue_url'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    i = 0

    def _bounded_worker(prod_url, cfg, kproducer, title):
        # Run the detail fetch, then always release the semaphore slot so
        # that at most `maxthreads` fetches are in flight at any time.
        try:
            get_product_details(prod_url, cfg, kproducer, title)
        finally:
            sema.release()

    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        # Category hrefs are relative; prefix the store base URL.
        url = f'{config["brouwland"]["url"]}{url}'
        soup = common.get_soup_page(url)
        if soup is None:
            url = next_url(process_categories)
            continue
        page_products = soup.find_all("article", class_="product")
        # Category pages may list sub-categories; queue any not seen before.
        for cat in soup.find_all("article", class_="item bg-gray"):
            anchor = cat.find('a')
            if anchor['href'] not in all_categories.values():
                anchor_title = anchor['title'].split(' | ')[0]
                process_categories[anchor_title] = f'{anchor["href"]}'
                all_categories[anchor_title] = anchor['href']
        for product in page_products:
            anchor = product.find("a")
            if anchor['href'] not in products.values():
                anchor_title = anchor['title'].split(' | ')[0]
                # Skip gift vouchers ("CADEAU") — not real products.
                if anchor_title.upper().find("CADEAU") > -1:
                    continue
                products[anchor_title] = anchor["href"]
                # Block until a worker slot frees up. Previously the
                # semaphore was created but never acquired, so the number
                # of concurrent threads was unbounded.
                sema.acquire()
                thread = threading.Thread(target=_bounded_worker, args=(f'{config["brouwland"]["url"]}{anchor["href"]}', config, producer, anchor_title))
                threads.append(thread)
                thread.start()
        next_page = soup.find("a", class_="next")
        if next_page is None:
            url = next_url(process_categories)
        else:
            # NOTE(review): this href gets the base URL prefixed again at
            # the top of the loop — assumes pagination hrefs are relative;
            # confirm against the live site.
            url = next_page["href"]
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['brouwland'])