You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 

158 lines
5.9 KiB

#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import re
import traceback
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer
def get_categories(url, cookie):
    """Collect category links from the store page's mega-dropdown menu.

    Returns a dict mapping anchor text -> absolute category URL.
    Links containing 'bricosolar' are skipped, and a URL already present
    in the result is not added twice.
    """
    soup = common.get_soup_page_no_limit(url, cookie)
    categories = {}
    menu = soup.find('div', class_="mxd-mega-dropdown")
    for link in menu.find_all('a'):
        href = link['href']
        if href.find('bricosolar') >= 0:
            # Deliberately excluded category.
            print(f'remove {href}')
            continue
        if href in categories.values():
            continue
        if href.find('https://www.brico.be') == -1:
            # Relative link: make it absolute (mutates the soup node, as before).
            href = f'https://www.brico.be{href}'
            link['href'] = href
        categories[link.text] = href
    return categories
def get_product_details(prod_url, config, kafka_producer, failed):
    """Scrape one Brico product detail page and publish product + price to Kafka.

    Parameters:
        prod_url: absolute URL of the product page.
        config: parsed configuration; reads config['brico'] (name, cookie).
        kafka_producer: producer handed through to common.add_product.
        failed: unused in this function; kept for caller compatibility.

    Any scraping error is recorded via common.dump_failed_product rather than
    raised, so one broken page cannot abort the whole crawl.
    """
    soup = None  # fix: the except handler references soup; keep it bound even if the fetch raises
    try:
        soup = common.get_soup_page_no_limit(prod_url, config['brico']['cookie'])
        prod_detail = soup.find('div', class_='mxd-container mxd-product-detail-layout')
        if prod_detail is None:
            return  # not a product detail page
        prod_specs = soup.find('div', {'id': 'specs'})
        title = prod_detail.find('h1', class_='mxd-product-detail-layout-heading mxd-h1')
        product = {}
        price = {}
        product['title'] = title.find("span", {'itemprop': 'name'}).text
        product['url'] = prod_url
        product['image_url'] = prod_detail.find("img", class_="mxd-fixed-ratio-image")['src']
        product['info'] = prod_specs.find_all("p")[0].text
        product['sku_code'] = prod_url.split("/")[-1]
        # EAN codes are 8-13 digits; first try the visible spec paragraphs.
        ean_pattern = r"([0-9]{8,13})"
        paragraphs = prod_specs.find_all("p")
        if len(paragraphs) > 1:
            ean_match = re.search(ean_pattern, paragraphs[1].text)
            if ean_match is not None:  # fix: don't fail the whole product when the paragraph has no EAN
                product['ean_code'] = ean_match.group(1)
        # Fallback: brand and EAN are also embedded as JSON in <script> tags.
        # fix: raw strings — the originals relied on invalid escapes (\: \") in
        # non-raw literals, which is a SyntaxWarning on modern Python.
        pattern_brand = r'brands":\[\{"code":"[\w\s]+","name":"([\w\s]+)'
        pattern_ean = r'"ean":"([0-9]{8,13})"'
        for script in soup.find_all("script"):
            if script.string is not None:
                if script.string.find('"brands"') >= 0:
                    match = re.search(pattern_brand, script.string)
                    product['brand'] = match.group(1)
                if script.string.find('"ean"') >= 0 and 'ean_code' not in product:
                    match = re.search(pattern_ean, script.string)
                    product['ean_code'] = match.group(1)
        # Price lives in <ins>; absent <ins> means no purchasable price shown.
        ins = soup.find("ins")  # fix: hoist the repeated soup.find("ins") lookups
        if ins is None:
            return
        price['price'] = ins.find("meta")['content']
        # A <del> element (struck-through old price) marks a promotion.
        price['promo'] = 0 if soup.find("del") is None else 1
        common.add_product(kafka_producer, config['brico']['name'], product, price)
    except Exception as err:
        common.dump_failed_product(config['brico']['name'], prod_url, soup, err, traceback.format_exc())
def next_url(url_list):
    """Pop and return the first URL from *url_list* (a name -> URL dict).

    The dict is mutated in place (the entry is removed); returns None when
    the dict is empty.
    """
    if not url_list:
        return None
    name, url = next(iter(url_list.items()))
    del url_list[name]
    return url
# Concurrency controls for the per-product scraping threads.
maxthreads = 10
# NOTE(review): this semaphore is created but never acquired anywhere in the
# visible code (get_product_details does not use it) — confirm whether thread
# throttling to `maxthreads` was intended but left unwired.
sema = threading.Semaphore(value=maxthreads)
# Every spawned product-detail thread; joined at the end of the main loop.
threads = list()
if __name__ == "__main__" :
    # Bootstrap: load config, connect the Kafka producer, register the store,
    # and clear this store's failed-product records from the previous run.
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['brico']['name'], 'url': config['brico']['url'], 'image_url': config['brico']['logo']})
    common.clear_failed_product(config['brico']['name'])
    # Seed the crawl: process_categories is the work queue (consumed by
    # next_url), while all_categories keeps everything ever seen for dedupe.
    all_categories = get_categories(config['brico']['url'], config['brico']['cookie'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}     # product name -> href, dedupes products across pages
    failed_prod = {}  # handed to every worker; presumably collects failures — TODO confirm (workers don't write it here)
    i = 0             # count of category pages processed, for progress output
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        soup = common.get_soup_page_no_limit(url, config['brico']['cookie'])
        page_products = soup.find_all("div", class_="mxd-block-card mxd-block-card-responsive")
        # Category pages may link to sub-categories; queue any new ones.
        extra_cat = soup.find_all("a", class_="mxd-product-list-link")
        if len(extra_cat) > 0:
            for cat in extra_cat:
                if cat['href'].find('bricosolar') >= 0:
                    # Deliberately skipped category.
                    print(f'main remove {cat["href"]}')
                    pass
                elif cat['href'] not in all_categories.values():
                    cat_name = cat.find("span").text
                    # Make the href root-relative before prefixing the base URL.
                    if cat['href'].find("/") > 0:
                        cat['href'] = f'/{cat["href"]}'
                    process_categories[cat_name] = f'{config["brico"]["url"]}{cat["href"]}'
                    #print(f'added {config["brico"]["url"]}{cat["href"]}')
                    all_categories[cat_name] = cat['href']
        # Spawn one worker thread per previously-unseen product on this page.
        for product in page_products:
            product = product.find("a", class_="mxd-block-card-link")
            # Normalize the href to a root-relative Dutch-locale ("/nl") path.
            if product['href'].find("/nl") == -1:
                product['href'] = f'/nl{product["href"]}'
            if product['href'].find("/") > 0:
                product['href'] = f'/{product["href"]}'
            if product['href'] not in products.values():
                prod_name = product.find("span", {"itemprop": "name"}).text
                #print(f'product url: {product["href"]}')
                products[prod_name] = product["href"]
                thread = threading.Thread(target=get_product_details,args=(f'{config["brico"]["url"]}{product["href"]}', config, producer, failed_prod))
                threads.append(thread)
                thread.start()
        # Follow pagination when present; otherwise pull the next queued category.
        next_page = soup.find("a", {"rel": "next"})
        if next_page is None:
            url = next_url(process_categories)
        else:
            # The nl-be alternate link supplies the base for the relative "next" href.
            base = soup.find("link", {"rel": "alternate", "hreflang": "nl-be"})
            url = f'{base["href"]}{next_page["href"]}'
            print(f'next page: {url}')
    # Wait for all product workers to finish before the final price update.
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['brico'])