|
|
- #!/usr/bin/env python3
-
- import requests
- import logging
- import xml.etree.ElementTree as ET
- import json
- import unidecode
- import sqlite3
- import common
- import threading
- import re
- import traceback
-
- from bs4 import BeautifulSoup
- from datetime import datetime
- from kafka import KafkaProducer
-
-
def get_categories(url, cookie):
    """Collect the category links found in the site's mega-dropdown menu.

    Args:
        url: Address of the page to download.
        cookie: Cookie value passed through to
            ``common.get_soup_page_no_limit``.

    Returns:
        A dict mapping each anchor's text to its absolute URL. Links whose
        href contains ``bricosolar`` are left out, and every URL is stored
        at most once.

    Raises:
        AttributeError: If the page has no ``div.mxd-mega-dropdown``
            element (``soup.find`` then yields ``None``).
    """
    site_root = 'https://www.brico.be'
    soup = common.get_soup_page_no_limit(url, cookie)

    categories = {}
    # Tracks the absolute URLs already stored so the duplicate check is done
    # on the normalized form (and in O(1)) instead of on the raw href.
    seen_urls = set()

    dropdown = soup.find('div', class_="mxd-mega-dropdown")

    # href=True skips anchors that carry no href attribute at all.
    for anchor in dropdown.find_all('a', href=True):
        href = anchor['href']
        if 'bricosolar' in href:
            print(f'remove {href}')
            continue

        # Relative links are made absolute before comparing or storing them.
        if site_root not in href:
            href = f'{site_root}{href}'
        if href in seen_urls:
            continue

        seen_urls.add(href)
        categories[anchor.text] = href

    return categories
-
-
def get_product_details(prod_url, config, kafka_producer, failed):
    """Scrape one product page and hand the product and price to ``common``.

    Args:
        prod_url: URL of the product page. Its last ``/``-separated segment
            is stored as ``sku_code``.
        config: Configuration mapping; ``config['brico']['cookie']`` and
            ``config['brico']['name']`` are read.
        kafka_producer: Producer object passed through to
            ``common.add_product``.
        failed: Not used by this function; kept so existing callers keep
            working.

    Returns:
        None. A page without the product-detail container or without an
        ``<ins>`` element is skipped silently. Any exception raised while
        scraping is reported through ``common.dump_failed_product`` instead
        of propagating.
    """
    # Raw strings: the patterns contain regex escapes such as \[ and \w that
    # are not valid Python string escapes.
    pattern_specs_ean = r'([0-9]{8,13})'
    pattern_brand = r'brands":\[\{"code":"[\w\s]+","name":"([\w\s]+)'
    pattern_ean = r'"ean":"([0-9]{8,13})"'

    # Bound before the try block so the except handler can always pass it on,
    # even when the download itself is what failed.
    # NOTE(review): confirm common.dump_failed_product accepts soup=None.
    soup = None
    try:
        soup = common.get_soup_page_no_limit(prod_url, config['brico']['cookie'])

        prod_detail = soup.find('div', class_='mxd-container mxd-product-detail-layout')
        if prod_detail is None:
            return
        prod_specs = soup.find('div', {'id': 'specs'})
        title = prod_detail.find('h1', class_='mxd-product-detail-layout-heading mxd-h1')

        product = {
            'title': title.find("span", {'itemprop': 'name'}).text,
            'url': prod_url,
            'image_url': prod_detail.find("img", class_="mxd-fixed-ratio-image")['src'],
        }
        spec_paragraphs = prod_specs.find_all("p")
        product['info'] = spec_paragraphs[0].text
        product['sku_code'] = prod_url.split("/")[-1]

        # First choice for the EAN: the second paragraph of the specs block.
        if len(spec_paragraphs) > 1:
            ean_match = re.search(pattern_specs_ean, spec_paragraphs[1].text)
            if ean_match is not None:
                product['ean_code'] = ean_match.group(1)

        # Brand (and EAN, when still missing) are read from inline scripts.
        for script in soup.find_all("script"):
            script_text = script.string
            if script_text is None:
                continue
            if '"brands"' in script_text:
                brand_match = re.search(pattern_brand, script_text)
                # The marker can be present without the expected layout;
                # leave the brand unset rather than failing the product.
                if brand_match is not None:
                    product['brand'] = brand_match.group(1)
            if 'ean_code' not in product and '"ean"' in script_text:
                script_ean_match = re.search(pattern_ean, script_text)
                if script_ean_match is not None:
                    product['ean_code'] = script_ean_match.group(1)

        price_tag = soup.find("ins")
        if price_tag is None:
            return
        price = {
            'price': price_tag.find("meta")['content'],
            # A <del> element on the page is treated as a promotion marker.
            'promo': 0 if soup.find("del") is None else 1,
        }

        common.add_product(kafka_producer, config['brico']['name'], product, price)
    except Exception as err:
        # Boundary of a worker thread: record the failure instead of raising.
        common.dump_failed_product(config['brico']['name'], prod_url, soup, err, traceback.format_exc())
-
-
def next_url(url_list):
    """Remove the first entry of the mapping and return its value.

    Args:
        url_list: Mapping of name to URL; it is modified in place.

    Returns:
        The value of the first key in iteration order, or ``None`` when the
        mapping has no entries left.
    """
    try:
        first_key = next(iter(url_list))
    except StopIteration:
        # Nothing left to hand out.
        return None
    return url_list.pop(first_key)
-
-
# Upper bound on the number of product-detail workers running at once.
maxthreads = 10
sema = threading.Semaphore(value=maxthreads)
threads = list()


def _run_and_release(target, *args):
    """Call ``target(*args)`` and then free the slot taken from ``sema``."""
    try:
        target(*args)
    finally:
        sema.release()


def _with_leading_slash(href):
    """Return ``href`` prefixed with ``/`` unless it already starts with one."""
    return href if href.startswith('/') else f'/{href}'


if __name__ == "__main__":
    config = common.get_config()
    brico = config['brico']

    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': brico['name'], 'url': brico['url'], 'image_url': brico['logo']})
    common.clear_failed_product(brico['name'])

    all_categories = get_categories(brico['url'], brico['cookie'])

    process_categories = all_categories.copy()
    # Every category URL that is already queued or crawled. Checking against
    # this set keeps the comparison on full URLs and O(1) per lookup.
    seen_category_urls = set(all_categories.values())
    url = next_url(process_categories)
    # Maps the site-relative product path to the product name; the key is
    # what prevents a product from being scraped twice.
    products = {}
    failed_prod = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')

        soup = common.get_soup_page_no_limit(url, brico['cookie'])
        page_products = soup.find_all("div", class_="mxd-block-card mxd-block-card-responsive")
        extra_cat = soup.find_all("a", class_="mxd-product-list-link")

        # Queue sub-categories linked from this page.
        for cat in extra_cat:
            cat_href = cat.get('href')
            if not cat_href:
                continue
            if 'bricosolar' in cat_href:
                print(f'main remove {cat_href}')
                continue

            cat_url = f'{brico["url"]}{_with_leading_slash(cat_href)}'
            if cat_url in seen_category_urls:
                continue
            seen_category_urls.add(cat_url)

            name_tag = cat.find("span")
            cat_name = name_tag.text if name_tag is not None else cat_url
            # Two categories can share a display name; fall back to the URL
            # as key so a pending entry is never overwritten.
            if cat_name in all_categories:
                cat_name = cat_url
            process_categories[cat_name] = cat_url
            all_categories[cat_name] = cat_url

        # Start one worker per product that has not been seen before.
        for card in page_products:
            link = card.find("a", class_="mxd-block-card-link")
            if link is None or not link.get('href'):
                continue

            # Ensure the leading slash first so the '/nl' prefix is never
            # glued directly onto a path segment.
            prod_path = _with_leading_slash(link['href'])
            if '/nl' not in prod_path:
                prod_path = f'/nl{prod_path}'
            if prod_path in products:
                continue

            name_tag = link.find("span", {"itemprop": "name"})
            products[prod_path] = name_tag.text if name_tag is not None else ''

            # Blocks while `maxthreads` workers are running; the worker frees
            # its slot again when it finishes.
            sema.acquire()
            thread = threading.Thread(
                target=_run_and_release,
                args=(get_product_details, f'{brico["url"]}{prod_path}', config, producer, failed_prod),
            )
            threads.append(thread)
            thread.start()

        next_page = soup.find("a", {"rel": "next"})
        base = soup.find("link", {"rel": "alternate", "hreflang": "nl-be"})
        if next_page is not None and base is None:
            # Without the base link the next-page URL cannot be built.
            print(f'no nl-be base link on {url}; skipping remaining pages')
        if next_page is None or base is None:
            url = next_url(process_categories)
        else:
            url = f'{base["href"]}{next_page["href"]}'
            print(f'next page: {url}')

    for t in threads:
        t.join()

    common.update_store_prices(producer, brico)
|