|
|
- #!/usr/bin/env python3
-
- import requests
- import logging
- import xml.etree.ElementTree as ET
- import json
- import unidecode
- import sqlite3
- import common
- import threading
- import re
- import traceback
- import sys
- import itertools
- import logging
- #logging.basicConfig(level=logging.DEBUG)
-
- from time import sleep
- from random import randint
- from bs4 import BeautifulSoup
- from datetime import datetime
- from kafka import KafkaProducer
- from ratelimit import limits, sleep_and_retry
-
-
-
def get_categories(url):
    """Scrape a catalogue page and map category titles to their hrefs.

    Each category is an ``<article class="item bg-gray">`` whose first
    anchor carries the link; the anchor's ``title`` attribute is
    "Category | Site", so only the part before ' | ' is kept as the key.

    Returns a dict of {category title: relative href}.
    """
    soup = common.get_soup_page(url)

    links = (item.find('a') for item in soup.find_all('article', class_="item bg-gray"))
    return {
        link['title'].split(' | ')[0]: link['href']
        for link in links
    }
-
-
def get_product_details(prod_url, config, kafka_producer, anchor_title):
    """Scrape one product page and publish its details and price to Kafka.

    Runs as a worker-thread target: failures are never raised to the
    caller, they are recorded via common.dump_failed_product instead.

    Parameters:
        prod_url: absolute URL of the product page.
        config: parsed configuration dict (reads config['brouwland']['name']).
        kafka_producer: producer handed through to common.add_product.
        anchor_title: listing title of the product (kept for interface
            compatibility; not used inside this function).
    """
    # Defined up-front so the except handler can safely reference it even
    # when common.get_soup_page itself raises (previously a NameError).
    soup = None
    # Gate on the module-level semaphore so at most `maxthreads` product
    # pages are scraped concurrently; the semaphore existed but was never
    # acquired, leaving the effective thread concurrency unbounded.
    with sema:
        try:
            soup = common.get_soup_page(prod_url)
            if soup is None:
                common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None)
                return

            prod_detail = soup.find('article')
            if prod_detail is None:
                common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None)
                return

            title = prod_detail.find('h1', {'itemprop': 'name'})
            # Guard the chained lookup: the image anchor may be absent, and
            # None.find(...) would raise AttributeError.
            image_anchor = prod_detail.find('a', class_='image-click')
            image = image_anchor.find('img') if image_anchor is not None else None
            description = prod_detail.find('div', {'itemprop': 'description'})
            ean_code = prod_detail.find('td', {'itemprop': 'eancode'})
            sku = prod_detail.find('span', {'itemprop': 'sku'})
            brand = prod_detail.find('span', {'itemprop': 'brand'})
            price_detail = prod_detail.find('span', {'itemprop': 'price'})

            product = {}
            price = {}
            if title is not None:
                product['title'] = title.text.strip()
                product['url'] = prod_url
                if image is not None:
                    product['image_url'] = image['src']
            if description is not None:
                product['info'] = description.text.strip()
            if sku is not None:
                product['sku_code'] = sku.text.strip()
            if ean_code is not None:
                product['ean_code'] = ean_code.text.strip()
            if brand is not None:
                product['brand'] = brand.text.strip()

            # Price text looks like "<currency> <amount>"; a missing price
            # element raises here and is reported through the except path.
            price['price'] = price_detail.text.split()[1]
            # A <del> element marks a struck-through old price => promotion.
            price['promo'] = 0 if soup.find("del") is None else 1

            common.add_product(kafka_producer, config['brouwland']['name'], product, price)
        except Exception as err:
            print(traceback.format_exc())
            common.dump_failed_product(config['brouwland']['name'], prod_url, soup, err, traceback.format_exc())
-
-
def next_url(url_list):
    """Pop and return the first queued URL, or None when the queue is empty.

    Mutates *url_list* in place: the entry for the first key (insertion
    order) is removed before its value is returned.

    Parameters:
        url_list: dict mapping titles to URLs, used as a FIFO work queue.

    Returns:
        The URL of the first entry, or None if the dict is empty.
    """
    # Idiomatic emptiness test (was: len(url_list) == 0).
    if not url_list:
        return None
    # dict.pop reads and removes in one step (was: lookup followed by del).
    return url_list.pop(next(iter(url_list)))
-
-
# Concurrency controls for the per-product scraper threads.
maxthreads = 5
# Intended to cap simultaneous get_product_details workers at `maxthreads`.
# NOTE(review): verify that every worker actually acquires this semaphore;
# an unacquired semaphore leaves the thread count unbounded.
sema = threading.Semaphore(value=maxthreads)
# Every spawned worker thread is appended here and joined in the main block.
threads = list()
-
- if __name__ == "__main__" :
- config = common.get_config()
-
- producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
- common.insert_update_store(producer, {'store': config['brouwland']['name'], 'url': config['brouwland']['url'], 'image_url': config['brouwland']['logo']})
- common.clear_failed_product(config['brouwland']['name'])
-
- all_categories = get_categories(config['brouwland']['catalogue_url'])
-
- process_categories = all_categories.copy()
- url = next_url(process_categories)
- products = {}
- i = 0
- while url is not None:
- i = i + 1
- print(f'{i}/{len(all_categories)} - {len(process_categories)}')
- url = f'{config["brouwland"]["url"]}{url}'
- soup = common.get_soup_page(url)
- if soup is None:
- url = next_url(process_categories)
- continue
- page_products = soup.find_all("article", class_="product")
- extra_cat = soup.find_all("article", class_="item bg-gray")
-
- if len(extra_cat) > 0:
- for cat in extra_cat:
- anchor = cat.find('a')
- if anchor['href'] not in all_categories.values():
- anchor_title = anchor['title'].split(' | ')[0]
- process_categories[anchor_title] = f'{anchor["href"]}'
- all_categories[anchor_title] = anchor['href']
- #print(f'added {cat["data-href"]}')
-
- for product in page_products:
- anchor = product.find("a")
- if anchor['href'] not in products.values():
- anchor_title = anchor['title'].split(' | ')[0]
- if anchor_title.upper().find("CADEAU") > -1:
- continue
- products[anchor_title] = anchor["href"]
-
- thread = threading.Thread(target=get_product_details,args=(f'{config["brouwland"]["url"]}{anchor["href"]}', config, producer, anchor_title))
- threads.append(thread)
- thread.start()
-
- next_page = soup.find("a", class_="next")
- if next_page is None:
- url = next_url(process_categories)
- else:
- url = next_page["href"]
-
- for t in threads:
- t.join()
-
- common.update_store_prices(producer, config['brouwland'])
|