#!/usr/bin/env python3
|
|
|
|
import requests
|
|
import logging
|
|
import xml.etree.ElementTree as ET
|
|
import json
|
|
import unidecode
|
|
import sqlite3
|
|
import common
|
|
import threading
|
|
import re
|
|
import traceback
|
|
|
|
from bs4 import BeautifulSoup
|
|
from datetime import datetime
|
|
from kafka import KafkaProducer
|
|
|
|
|
|
def get_categories(url, cookie):
    """Scrape the mega-dropdown navigation at *url* and return a mapping of
    {category link text: absolute category URL}.

    Links containing 'bricosolar' are excluded, and URLs already collected
    are skipped (de-duplication is by URL, not by name).

    :param url: page whose navigation dropdown is scraped
    :param cookie: cookie string forwarded to common.get_soup_page_no_limit
    :return: dict of category name -> absolute https://www.brico.be URL
    """
    soup = common.get_soup_page_no_limit(url, cookie)

    categories = {}

    dropdown = soup.find('div', class_="mxd-mega-dropdown")
    if dropdown is None:
        # Layout changed or the request was blocked: return the empty dict
        # instead of crashing on dropdown.find_all below.
        return categories

    for anchor in dropdown.find_all('a'):
        href = anchor['href']
        if 'bricosolar' in href:
            # Excluded sub-shop; log it and move on (original printed then fell
            # through a dead `pass`).
            print(f'remove {href}')
            continue
        if href in categories.values():
            continue  # already collected under another name
        # Relative links need the site prefix; check before prefixing, as the
        # de-dup test above intentionally ran on the raw href.
        if href.find('https://www.brico.be') == -1:
            href = f'https://www.brico.be{href}'
            anchor['href'] = href
        categories[anchor.text] = href

    return categories
|
|
|
|
|
|
def get_product_details(prod_url, config, kafka_producer, failed):
    """Fetch one product page, extract product + price data and publish it
    through common.add_product.

    :param prod_url: absolute product-page URL (last path segment is the SKU)
    :param config: parsed config; the 'brico' section supplies cookie and name
    :param kafka_producer: producer forwarded to common.add_product
    :param failed: kept for interface compatibility — failures are reported via
        common.dump_failed_product, not through this dict
    """
    # Defined before the try so the except-handler can always reference it;
    # previously an exception inside the fetch itself caused a NameError here.
    soup = None
    try:
        soup = common.get_soup_page_no_limit(prod_url, config['brico']['cookie'])

        prod_detail = soup.find('div', class_='mxd-container mxd-product-detail-layout')
        if prod_detail is None:
            return  # not a product-detail page
        prod_specs = soup.find('div', {'id': 'specs'})
        title = prod_detail.find('h1', class_='mxd-product-detail-layout-heading mxd-h1')

        product = {}
        price = {}
        product['title'] = title.find("span", {'itemprop': 'name'}).text
        product['url'] = prod_url
        product['image_url'] = prod_detail.find("img", class_="mxd-fixed-ratio-image")['src']
        # Hoisted: the spec paragraphs were previously re-queried three times.
        spec_paragraphs = prod_specs.find_all("p")
        product['info'] = spec_paragraphs[0].text
        product['sku_code'] = prod_url.split("/")[-1]

        # EAN barcodes are 8-13 digits, usually in the second spec paragraph.
        ean_pattern = r"([0-9]{8,13})"
        if len(spec_paragraphs) > 1:
            ean_match = re.search(ean_pattern, spec_paragraphs[1].text)
            if ean_match is not None:  # guard: .group(1) on None raised before
                product['ean_code'] = ean_match.group(1)

        # Fallback: brand and EAN also appear in JSON blobs embedded in
        # <script> tags. Raw strings replace the previous invalid escape
        # sequences (\: etc.) while matching the same text.
        pattern_brand = r'brands":\[\{"code":"[\w\s]+","name":"([\w\s]+)'
        pattern_ean = r'"ean":"([0-9]{8,13})"'
        for script in soup.find_all("script"):
            if script.string is None:
                continue
            if script.string.find('"brands"') >= 0:
                match = re.search(pattern_brand, script.string)
                if match is not None:
                    product['brand'] = match.group(1)
            if script.string.find('"ean"') >= 0 and 'ean_code' not in product:
                match = re.search(pattern_ean, script.string)
                if match is not None:
                    product['ean_code'] = match.group(1)

        ins_tag = soup.find("ins")
        if ins_tag is None:
            return  # no displayed price: skip the product
        price['price'] = ins_tag.find("meta")['content']
        # A <del> element (struck-through old price) marks a promotion.
        price['promo'] = 0 if soup.find("del") is None else 1

        common.add_product(kafka_producer, config['brico']['name'], product, price)
    except Exception as err:
        # Best-effort scraping: record the failure (soup may be None if the
        # fetch itself failed) and keep the crawl going.
        common.dump_failed_product(config['brico']['name'], prod_url, soup, err, traceback.format_exc())
|
|
|
|
|
|
def next_url(url_list):
    """Pop the first-inserted entry from *url_list* and return its URL.

    Mutates the dict in place; returns None once it is empty.
    """
    if not url_list:
        return None

    first_key = next(iter(url_list))
    return url_list.pop(first_key)
|
|
|
|
|
|
# Intended cap on concurrent product-detail scraper threads.
maxthreads = 10
# NOTE(review): this semaphore is never acquired anywhere in the visible code,
# so the thread count is effectively unbounded — confirm whether a helper in
# `common` uses it, otherwise acquire/release it inside get_product_details.
sema = threading.Semaphore(value=maxthreads)
# Every spawned thread is kept here and joined at the end of the main block.
threads = list()
|
|
|
|
# Main crawl: register the store, walk every category page (following both
# sub-category links and "next page" pagination), and scrape each product in
# its own thread.
if __name__ == "__main__" :
    config = common.get_config()

    # NOTE(review): 'kafka_boostrap_servers' looks like a typo of
    # 'bootstrap' but it is a live config key — fix it together with the
    # config file, not here.
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['brico']['name'], 'url': config['brico']['url'], 'image_url': config['brico']['logo']})
    common.clear_failed_product(config['brico']['name'])

    all_categories = get_categories(config['brico']['url'], config['brico']['cookie'])

    # process_categories is the work queue (consumed by next_url);
    # all_categories keeps everything ever seen, for de-duplication.
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}      # product name -> href, used only to de-duplicate
    failed_prod = {}   # passed to workers but never written — see get_product_details
    i = 0
    while url is not None:
        i = i + 1
        # Progress: categories processed / total known - still queued.
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')

        soup = common.get_soup_page_no_limit(url, config['brico']['cookie'])
        page_products = soup.find_all("div", class_="mxd-block-card mxd-block-card-responsive")
        extra_cat = soup.find_all("a", class_="mxd-product-list-link")

        # Category pages can expose further sub-category links; enqueue any
        # new ones (except the excluded 'bricosolar' sub-shop).
        if len(extra_cat) > 0:
            for cat in extra_cat:
                if cat['href'].find('bricosolar') >= 0:
                    print(f'main remove {cat["href"]}')
                    pass
                elif cat['href'] not in all_categories.values():
                    cat_name = cat.find("span").text
                    # Relative hrefs missing the leading slash get one so the
                    # base-URL concatenation below stays well-formed.
                    if cat['href'].find("/") > 0:
                        cat['href'] = f'/{cat["href"]}'
                    process_categories[cat_name] = f'{config["brico"]["url"]}{cat["href"]}'
                    #print(f'added {config["brico"]["url"]}{cat["href"]}')
                    all_categories[cat_name] = cat['href']

        # One thread per previously-unseen product on this page.
        # NOTE(review): `sema` (maxthreads) is never acquired here, so
        # concurrency is unbounded; `threads` also grows for the whole run.
        for product in page_products:
            product = product.find("a", class_="mxd-block-card-link")
            # Normalise the href: ensure the '/nl' language prefix and a
            # leading slash before joining with the base URL.
            if product['href'].find("/nl") == -1:
                product['href'] = f'/nl{product["href"]}'
            if product['href'].find("/") > 0:
                product['href'] = f'/{product["href"]}'
            if product['href'] not in products.values():
                prod_name = product.find("span", {"itemprop": "name"}).text
                #print(f'product url: {product["href"]}')
                products[prod_name] = product["href"]

                thread = threading.Thread(target=get_product_details,args=(f'{config["brico"]["url"]}{product["href"]}', config, producer, failed_prod))
                threads.append(thread)
                thread.start()

        # Follow pagination if present, otherwise pull the next category
        # off the queue.
        next_page = soup.find("a", {"rel": "next"})
        if next_page is None:
            url = next_url(process_categories)
        else:
            # The nl-be alternate link supplies the absolute base for the
            # relative "next" href.
            base = soup.find("link", {"rel": "alternate", "hreflang": "nl-be"})
            url = f'{base["href"]}{next_page["href"]}'
            print(f'next page: {url}')

    # Wait for every scraper thread before the final price update.
    for t in threads:
        t.join()

    common.update_store_prices(producer, config['brico'])
|