
first commit

master
Steven Kuterna 3 years ago
commit 58a04a1798
11 changed files with 1234 additions and 0 deletions
  1. README.md (+0, -0)
  2. cheap_shopper.db (BIN)
  3. cheap_shopper.yaml (+54, -0)
  4. common.py (+152, -0)
  5. db_consumer.py (+187, -0)
  6. parser_action.py (+89, -0)
  7. parser_brico.py (+158, -0)
  8. parser_brouwland.py (+153, -0)
  9. parser_dreamland.py (+180, -0)
  10. parser_fun.py (+166, -0)
  11. parser_hubo.py (+95, -0)

README.md (+0, -0)


cheap_shopper.db (BIN)


cheap_shopper.yaml (+54, -0)

@@ -0,0 +1,54 @@
config:
  loglevel: logging.DEBUG
  log_sql: False
  sqlitedb: cheap_shopper.db
  active_stores: action, fun, brico
  kafka_boostrap_servers: apollo.home.lan
action:
  name: Action
  url: https://www.action.com
  logo: https://upload.wikimedia.org/wikipedia/commons/4/45/Action_Nederland_Logo_2020.svg
  categories_url: https://www.action.com/api/navigation/categories?language=nl-BE
  products_url: https://www.action.com/api/subcategories/products/?routeSegment=__url__&offset=__offset__&limit=__limit__
  products_cookie: {'epslanguage': 'nl-BE'}
brico:
  name: Brico
  url: https://www.brico.be
  logo: https://vdm.bricocdn.be/logos/brico.svg?ie
  cookie: {'language': 'nl'}
brouwland:
  name: Brouwland
  url: https://www.brouwland.com
  logo: https://brouwlandprod-yappa.netdna-ssl.com/build/images/logo-brouwland2.2fc01423.png
  catalogue_url: https://www.brouwland.com/nl/onze-producten
dreamland:
  name: Dreamland
  url: https://www.dreamland.be/e/nl/dl
  logo: https://seeklogo.com/images/D/dreamland-be-logo-7F2C0508F2-seeklogo.com.png
delhaize:
  name: Delhaize
  url: https://delhaize.be/shop
  logo:
fun:
  name: Fun
  url: https://www.fun.be
  logo: https://e7.pngegg.com/pngimages/935/634/png-clipart-fun-logo-fun-toyshop-logo-icons-logos-emojis-toy-shop-logos.png
  categories_url: https://www.fun.be/seall/menu/index
gamma:
  name: Gamma
  url: https://www.gamma.be
  logo: https://nl.wikipedia.org/wiki/Gamma_(winkel)#/media/Bestand:Gamma_logo_2010.png
  categories_url: https://www.gamma.be/nl/resources/menu/categories
hubo:
  name: Hubo
  url: https://www.hubo.be
  logo: https://www.hubo.be/content/dam/hubo/site-afbeeldingen/hubo-algemeen/logo.svg
  categories_url: https://www.hubo.be/nl/a.html
  products_url: https://sd.searchnode.net/v1/query/docs?query_key=CzVuTtVooUCa8uwwNA6tA9X6lMXVLq01&sort=-score&filters.categoriesMeta=assortiment&limit=60
  products_url_offset: https://sd.searchnode.net/v1/query/docs?query_key=CzVuTtVooUCa8uwwNA6tA9X6lMXVLq01&sort=-score&filters.categoriesMeta=assortiment&offset=__OFFSET__
intratuin_nl:
  name: Intratuin NL
  url: https://www.intratuin.nl
intratuin_be:
  name: Intratuin BE
  url: https://www.intratuin.be

common.py (+152, -0)

@@ -0,0 +1,152 @@
import yaml
import json
import os
import shutil
import traceback
import requests
import re
from bs4 import BeautifulSoup
from ratelimit import limits, sleep_and_retry
from time import sleep
from random import randint
from datetime import datetime

def get_config():
    with open('cheap_shopper.yaml', 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
    return cfg

def add_product(kafka_producer, store, product, price):
    db_object = {}
    db_object['type'] = 'product'
    db_object['store'] = store
    db_object['product'] = product
    db_object['price'] = price
    db_object_json = json.dumps(db_object)
    db_object_bytes = bytearray(db_object_json, 'utf-8')
    send_kafka_message(kafka_producer, db_object_bytes)

def insert_update_store(kafka_producer, store):
    db_object = {}
    db_object['type'] = 'store'
    db_object['store'] = store
    db_object_json = json.dumps(db_object)
    db_object_bytes = bytearray(db_object_json, 'utf-8')
    send_kafka_message(kafka_producer, db_object_bytes)

def send_kafka_message(kafka_producer, message):
    kafka_producer.send('shopper_db', message)

def update_store_prices(kafka_producer, config):
    insert_update_store(kafka_producer, {'store': config['name'], 'url': config['url'], 'image_url': config['logo'], 'last_update': datetime.now().strftime('%d/%m/%Y')})
    db_object = {}
    db_object['type'] = 'store_update'
    db_object['store'] = config['name']
    db_object_json = json.dumps(db_object)
    db_object_bytes = bytearray(db_object_json, 'utf-8')
    send_kafka_message(kafka_producer, db_object_bytes)
    print(f'updating prices {db_object_json}')

def clear_failed_product(store):
    if not os.path.exists('failed'):
        os.mkdir('failed')
    if not os.path.exists(f'failed/{store}'):
        os.mkdir(f'failed/{store}')
        return
    else:
        for dir in os.listdir(f'failed/{store}/'):
            shutil.rmtree(f'failed/{store}/{dir}')

def dump_failed_product(store, prod_url, page, err, trace):
    if not os.path.exists('failed'):
        os.mkdir('failed')
    if not os.path.exists(f'failed/{store}'):
        os.mkdir(f'failed/{store}')
    dirname = prod_url.replace('https://', '')
    dirname = dirname.replace('.', '-')
    dirname = dirname.replace('/', '_')
    os.mkdir(f'failed/{store}/{dirname}')
    err_file = open(f'failed/{store}/{dirname}/error.txt', "w")
    err_file.write(f'{prod_url}\n')
    err_file.write('===========================================\n')
    err_file.write(f'{str(err)}\n')
    err_file.write('===========================================\n')
    err_file.write(str(trace))
    err_file.close()
    page_file = open(f'failed/{store}/{dirname}/page.html', "w", encoding="utf-8")
    page_file.write(str(page))
    page_file.close()

def get_proxies():
    page = requests.get("https://free-proxy-list.net/")
    soup = BeautifulSoup(page.content, "html.parser")
    raw_div = soup.find("div", {"id": "raw"})
    raw_textarea = raw_div.find("textarea")
    # match "ip:port" entries from the raw proxy list
    ip_list = re.findall(r"([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+)", raw_textarea.text)
    ip_dict = {}
    for ip in ip_list:
        ip_addr = ip.split(":")[0]
        ip_port = ip.split(":")[1]
        ip_dict[ip_addr] = ip_port
    return ip_list

@sleep_and_retry
@limits(calls=2, period=1)
def get_soup_page(url, cookie=None):
    #print(f'get page for soup: {url}')
    sleep(randint(1, 2))
    soup = None
    try:
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36', 'content-type': 'text/html;charset=UTF-8'}
        #page = requests.get(url, proxies={"http": next(proxy_pool), "https": next(proxy_pool)})
        page = requests.get(url, cookies=cookie, headers=headers)
        if page.status_code != 200 and page.status_code != 301:
            return None
        soup = BeautifulSoup(page.content, "html.parser")
    except ConnectionRefusedError:
        print(traceback.format_exc())
        sleep(randint(2, 3))
        # retry the same request, keeping the original cookie
        soup = get_soup_page(url, cookie)
    except Exception as err:
        print(traceback.format_exc())
        print(err)
    return soup

def get_soup_page_no_limit(url, cookie=None, payload=None, headers={}):
    #print(f'get page for soup: {url}')
    sleep(randint(1, 2))
    soup = None
    try:
        if len(headers) == 0:
            headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
                       'content-type': 'text/html;charset=UTF-8'}
        if 'user-agent' not in headers:
            headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
        headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51'
        #page = requests.get(url, proxies={"http": next(proxy_pool), "https": next(proxy_pool)})
        page = requests.get(url, cookies=cookie, data=payload, headers=headers)
        #print(page.content)
        if page.status_code != 200 and page.status_code != 301:
            return None
        soup = BeautifulSoup(page.content, "html.parser")
    except Exception as err:
        print(traceback.format_exc())
        print(err)
    return soup

db_consumer.py (+187, -0)

@@ -0,0 +1,187 @@
#!/usr/bin/env python3
import sqlite3
import common
import json
import time
import threading
from datetime import datetime
from kafka import KafkaConsumer

def committer(connection):
    while True:
        connection.commit()
        time.sleep(10)

def insert_update_store(connection, stores, value_json):
    cursor = connection.cursor()
    columns = '('
    values_sql = '('
    values = []
    update_sql = ''
    update_values = []
    for dict_key in value_json['store']:
        columns = f'{columns}{dict_key}, '
        values.append(value_json['store'][dict_key])
        values_sql = f'{values_sql}?, '
        if dict_key != 'store':
            update_sql = f'{update_sql}{dict_key}=?, '
            update_values.append(value_json['store'][dict_key])
    columns = columns[:-2] + ')'
    values_sql = values_sql[:-2] + ')'
    update_sql = update_sql[:-2]
    update_values.append(value_json["store"]["store"])
    try:
        sql_statement = f'INSERT INTO store {columns} VALUES {values_sql}'
        cursor.execute(sql_statement, values)
        stores[value_json["store"]["store"]] = cursor.lastrowid
    except sqlite3.IntegrityError as err:
        try:
            sql_statement = f'UPDATE store SET {update_sql} WHERE store=?'
            cursor.execute(sql_statement, tuple(update_values))
        except Exception as err:
            print(err)
    except Exception as err:
        print(err)

def insert_update_product(connection, stores, value_json):
    cursor = connection.cursor()
    columns = '(store, '
    values_sql = '(?, '
    values = [stores[value_json["store"]]]
    update_sql = ''
    update_values = []
    for dict_key in value_json['product']:
        columns = f'{columns}{dict_key}, '
        values.append(value_json['product'][dict_key])
        values_sql = f'{values_sql}?, '
        if dict_key != 'sku_code':
            update_sql = f'{update_sql}{dict_key}=?, '
            update_values.append(value_json['product'][dict_key])
    columns = columns[:-2] + ')'
    values_sql = values_sql[:-2] + ')'
    update_sql = update_sql[:-2]
    update_values.append(stores[value_json["store"]])
    update_values.append(value_json["product"]["sku_code"])
    product_id = None
    try:
        sql_statement = f'INSERT INTO products {columns} VALUES {values_sql}'
        cursor.execute(sql_statement, values)
        product_id = cursor.lastrowid
        #print(f'inserted {product_id}')
    except sqlite3.IntegrityError as err:
        try:
            sql_statement = f'UPDATE products SET {update_sql} WHERE store=? and sku_code=?'
            cursor.execute(sql_statement, tuple(update_values))
            sql_statement = 'SELECT id FROM products WHERE store=? and sku_code=?'
            cursor.execute(sql_statement, (stores[value_json["store"]], value_json["product"]["sku_code"]))
            product_id = cursor.fetchone()[0]
            #print(f'updated {product_id}')
        except Exception as err:
            print(err)
    except Exception as err:
        print(err)
    insert_update_price(connection, stores, value_json, product_id)

def insert_update_price(connection, stores, value_json, product_id):
    cursor = connection.cursor()
    columns = '(product_id, last_update, '
    values_sql = '(?, ?, '
    values = [product_id, datetime.now().strftime('%d/%m/%Y')]
    for dict_key in value_json['price']:
        columns = f'{columns}{dict_key}, '
        values.append(value_json['price'][dict_key])
        values_sql = f'{values_sql}?, '
    columns = columns[:-2] + ')'
    values_sql = values_sql[:-2] + ')'
    price_exists = False
    try:
        sql_statement = 'SELECT id FROM price WHERE product_id=? and price=? and active=1'
        cursor.execute(sql_statement, (product_id, value_json["price"]["price"]))
        if cursor.fetchone():
            price_exists = True
    except Exception as err:
        print(err)
    try:
        if not price_exists:
            sql_statement = f'INSERT INTO price {columns} VALUES {values_sql}'
            cursor.execute(sql_statement, values)
        else:
            sql_statement = 'UPDATE price SET last_update=? WHERE product_id=? and price=? and active=1'
            cursor.execute(sql_statement, (datetime.now().strftime("%d/%m/%Y"), product_id, value_json["price"]["price"]))
    except sqlite3.IntegrityError as err:
        print(err)
    except Exception as err:
        print(err)

def deactivate_old_price(connection, stores, value_json):
    cursor = connection.cursor()
    try:
        sql_statement = 'UPDATE price SET active=0 WHERE product_id IN (SELECT id FROM products WHERE store=?) AND last_update<?'
        cursor.execute(sql_statement, (stores[value_json["store"]], datetime.now().strftime("%d/%m/%Y")))
        #print(sql_statement)
    except Exception as err:
        print(err)

def insert(connection, stores, value_json):
    if value_json['type'] == 'store':
        print('insert_update_store')
        insert_update_store(connection, stores, value_json)
    if value_json['type'] == 'store_update':
        print('deactivate_old_price')
        deactivate_old_price(connection, stores, value_json)
    if value_json['type'] == 'product':
        #print('insert_update_product')
        insert_update_product(connection, stores, value_json)

def get_stores(connection):
    stores = {}
    # use the connection passed in as an argument, not the module-level handle
    cursor = connection.cursor()
    sql_statement = 'SELECT store, id FROM store'
    cursor.execute(sql_statement)
    result = cursor.fetchall()
    for res in result:
        stores[res[0]] = res[1]
    return stores

if __name__ == '__main__':
    config = common.get_config()
    con = sqlite3.connect(config['config']['sqlitedb'], check_same_thread=False)
    if config['config']['log_sql']:
        con.set_trace_callback(print)
    stores = get_stores(con)
    commit_thread = threading.Thread(target=committer, args=(con,), daemon=True)
    commit_thread.start()
    consumer = KafkaConsumer('shopper_db', bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    for msg in consumer:
        value_json = json.loads(msg.value.decode("utf-8"))
        insert(con, stores, value_json)

parser_action.py (+89, -0)

@@ -0,0 +1,89 @@
#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import traceback
import sys
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer

def get_categories(url):
    page = requests.get(url)
    root = ET.fromstring(page.content)
    categories = {}
    for cat_elm in root.findall("./Data/CategoryNavigationViewModel"):
        cat_label = cat_elm.find("Label")
        cat_url = cat_elm.find("Url")
        categories[cat_label.text] = cat_url.text
    return categories

def get_products(config, cat_url, kafka_producer):
    url = config['action']['products_url']
    url = url.replace("__url__", cat_url)
    url = url.replace("__offset__", "0")
    total_url = url.replace("__limit__", "1")
    products_cookie = config['action']['products_cookie']
    get_total = requests.get(total_url, cookies=products_cookie)
    data_json = json.loads(unidecode.unidecode(get_total.content.decode('utf-8', 'ignore')))
    totalCount = data_json['totalCount']
    all_url = url.replace("__limit__", str(totalCount))
    products = requests.get(all_url, cookies=products_cookie)
    products_json = json.loads(unidecode.unidecode(products.content.decode('utf-8', 'ignore')))
    for product_json in products_json['items']:
        try:
            if product_json['type'] != 'product':
                continue
            product = {}
            price = {}
            product['title'] = product_json['title']
            product['url'] = config['action']['url'] + product_json['url']
            product['image_url'] = config['action']['url'] + product_json['imageUrl']
            product['info'] = product_json['subTitle']
            product['sku_code'] = product_json['code']
            for spec in product_json['specifications']:
                if spec['id'] == 'attEANCodeVariant':
                    product['ean_code'] = spec['value']
                if spec['id'] == 'attLongDescription':
                    product['description'] = spec['value']
            product['brand'] = product_json['brandName']
            price['price'] = product_json['price']
            price['promo'] = 0 if product_json['isDeal'] == False else 1
            price['promo_start'] = product_json['dealStartDate']
            price['promo_end'] = product_json['dealEndDate']
            common.add_product(kafka_producer, config['action']['name'], product, price)
        except Exception as err:
            common.dump_failed_product(config['action']['name'], cat_url, product_json, err, traceback.format_exc())

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['action']['name'], 'url': config['action']['url'], 'image_url': config['action']['logo']})
    common.clear_failed_product(config['action']['name'])
    categories = get_categories(config['action']['categories_url'])
    for cat in categories:
        soup = common.get_soup_page_no_limit(f"{config['action']['url']}{categories[cat]}")
        subcat = soup.find_all("a", class_="subcategory-cta-list__cta")
        for sc in subcat:
            get_products(config, sc['href'], producer)
    common.update_store_prices(producer, config['action'])

parser_brico.py (+158, -0)

@@ -0,0 +1,158 @@
#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import re
import traceback
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer

def get_categories(url, cookie):
    soup = common.get_soup_page_no_limit(url, cookie)
    categories = {}
    dropdown = soup.find('div', class_="mxd-mega-dropdown")
    anchors = dropdown.find_all('a')
    for anchor in anchors:
        if anchor['href'].find('bricosolar') >= 0:
            print(f'remove {anchor["href"]}')
        elif anchor['href'] not in categories.values():
            if anchor['href'].find('https://www.brico.be') == -1:
                anchor['href'] = f'https://www.brico.be{anchor["href"]}'
            categories[anchor.text] = anchor['href']
    return categories

def get_product_details(prod_url, config, kafka_producer, failed):
    try:
        soup = common.get_soup_page_no_limit(prod_url, config['brico']['cookie'])
        prod_detail = soup.find('div', class_='mxd-container mxd-product-detail-layout')
        if prod_detail is None:
            return
        prod_specs = soup.find('div', {'id': 'specs'})
        title = prod_detail.find('h1', class_='mxd-product-detail-layout-heading mxd-h1')
        product = {}
        price = {}
        product['title'] = title.find("span", {'itemprop': 'name'}).text
        product['url'] = prod_url
        product['image_url'] = prod_detail.find("img", class_="mxd-fixed-ratio-image")['src']
        product['info'] = prod_specs.find_all("p")[0].text
        product['sku_code'] = prod_url.split("/")[-1]
        ean_pattern = "([0-9]{8,13})"
        if len(prod_specs.find_all("p")) > 1:
            ean_match = re.search(ean_pattern, prod_specs.find_all("p")[1].text)
            product['ean_code'] = ean_match.group(1)
        pattern_brand = 'brands\"\:\[\{\"code\"\:\"[\w\s]+\",\"name\"\:\"([\w\s]+)'
        pattern_ean = '\"ean\"\:\"([0-9]{8,13})\"'
        scripts = soup.find_all("script")
        for script in scripts:
            if script.string is not None:
                if script.string.find('"brands"') >= 0:
                    match = re.search(pattern_brand, script.string)
                    product['brand'] = match.group(1)
                if script.string.find('"ean"') >= 0 and 'ean_code' not in product:
                    match = re.search(pattern_ean, script.string)
                    product['ean_code'] = match.group(1)
        if soup.find("ins") is None:
            return
        price['price'] = soup.find("ins").find("meta")['content']
        price['promo'] = 0 if soup.find("del") is None else 1
        common.add_product(kafka_producer, config['brico']['name'], product, price)
    except Exception as err:
        common.dump_failed_product(config['brico']['name'], prod_url, soup, err, traceback.format_exc())

def next_url(url_list):
    if len(url_list) == 0:
        return None
    key = next(iter(url_list))
    url = url_list[key]
    del url_list[key]
    return url

maxthreads = 10
sema = threading.Semaphore(value=maxthreads)
threads = list()

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['brico']['name'], 'url': config['brico']['url'], 'image_url': config['brico']['logo']})
    common.clear_failed_product(config['brico']['name'])
    all_categories = get_categories(config['brico']['url'], config['brico']['cookie'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    failed_prod = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        soup = common.get_soup_page_no_limit(url, config['brico']['cookie'])
        page_products = soup.find_all("div", class_="mxd-block-card mxd-block-card-responsive")
        extra_cat = soup.find_all("a", class_="mxd-product-list-link")
        if len(extra_cat) > 0:
            for cat in extra_cat:
                if cat['href'].find('bricosolar') >= 0:
                    print(f'main remove {cat["href"]}')
                elif cat['href'] not in all_categories.values():
                    cat_name = cat.find("span").text
                    if cat['href'].find("/") > 0:
                        cat['href'] = f'/{cat["href"]}'
                    process_categories[cat_name] = f'{config["brico"]["url"]}{cat["href"]}'
                    #print(f'added {config["brico"]["url"]}{cat["href"]}')
                    all_categories[cat_name] = cat['href']
        for product in page_products:
            product = product.find("a", class_="mxd-block-card-link")
            if product['href'].find("/nl") == -1:
                product['href'] = f'/nl{product["href"]}'
            if product['href'].find("/") > 0:
                product['href'] = f'/{product["href"]}'
            if product['href'] not in products.values():
                prod_name = product.find("span", {"itemprop": "name"}).text
                #print(f'product url: {product["href"]}')
                products[prod_name] = product["href"]
                thread = threading.Thread(target=get_product_details, args=(f'{config["brico"]["url"]}{product["href"]}', config, producer, failed_prod))
                threads.append(thread)
                thread.start()
        next_page = soup.find("a", {"rel": "next"})
        if next_page is None:
            url = next_url(process_categories)
        else:
            base = soup.find("link", {"rel": "alternate", "hreflang": "nl-be"})
            url = f'{base["href"]}{next_page["href"]}'
            print(f'next page: {url}')
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['brico'])

parser_brouwland.py (+153, -0)

@@ -0,0 +1,153 @@
#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import re
import traceback
import sys
import itertools
#logging.basicConfig(level=logging.DEBUG)
from time import sleep
from random import randint
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer
from ratelimit import limits, sleep_and_retry

def get_categories(url):
    soup = common.get_soup_page(url)
    categories = {}
    articles = soup.find_all('article', class_="item bg-gray")
    for article in articles:
        anchor = article.find('a')
        anchor_title = anchor['title'].split(' | ')[0]
        categories[anchor_title] = anchor['href']
    return categories

def get_product_details(prod_url, config, kafka_producer, anchor_title):
    try:
        soup = common.get_soup_page(prod_url)
        if soup is None:
            common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None)
            return
        prod_detail = soup.find('article')
        if prod_detail is None:
            common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None)
            return
        title = prod_detail.find('h1', {'itemprop': 'name'})
        image = prod_detail.find('a', class_='image-click').find('img')
        description = prod_detail.find('div', {'itemprop': 'description'})
        ean_code = prod_detail.find('td', {'itemprop': 'eancode'})
        sku = prod_detail.find('span', {'itemprop': 'sku'})
        brand = prod_detail.find('span', {'itemprop': 'brand'})
        price_detail = prod_detail.find('span', {'itemprop': 'price'})
        product = {}
        price = {}
        if title is not None:
            product['title'] = title.text.strip()
        product['url'] = prod_url
        product['image_url'] = image['src']
        if description is not None:
            product['info'] = description.text.strip()
        if sku is not None:
            product['sku_code'] = sku.text.strip()
        if ean_code is not None:
            product['ean_code'] = ean_code.text.strip()
        if brand is not None:
            product['brand'] = brand.text.strip()
        price['price'] = price_detail.text.split()[1]
        price['promo'] = 0 if soup.find("del") is None else 1
        common.add_product(kafka_producer, config['brouwland']['name'], product, price)
    except Exception as err:
        print(traceback.format_exc())
        common.dump_failed_product(config['brouwland']['name'], prod_url, soup, err, traceback.format_exc())

def next_url(url_list):
    if len(url_list) == 0:
        return None
    key = next(iter(url_list))
    url = url_list[key]
    del url_list[key]
    return url

maxthreads = 5
sema = threading.Semaphore(value=maxthreads)
threads = list()

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['brouwland']['name'], 'url': config['brouwland']['url'], 'image_url': config['brouwland']['logo']})
    common.clear_failed_product(config['brouwland']['name'])
    all_categories = get_categories(config['brouwland']['catalogue_url'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        url = f'{config["brouwland"]["url"]}{url}'
        soup = common.get_soup_page(url)
        if soup is None:
            url = next_url(process_categories)
            continue
        page_products = soup.find_all("article", class_="product")
        extra_cat = soup.find_all("article", class_="item bg-gray")
        if len(extra_cat) > 0:
            for cat in extra_cat:
                anchor = cat.find('a')
                if anchor['href'] not in all_categories.values():
                    anchor_title = anchor['title'].split(' | ')[0]
                    process_categories[anchor_title] = f'{anchor["href"]}'
                    all_categories[anchor_title] = anchor['href']
                    #print(f'added {cat["data-href"]}')
        for product in page_products:
            anchor = product.find("a")
            if anchor['href'] not in products.values():
                anchor_title = anchor['title'].split(' | ')[0]
                if anchor_title.upper().find("CADEAU") > -1:
                    continue
                products[anchor_title] = anchor["href"]
                thread = threading.Thread(target=get_product_details, args=(f'{config["brouwland"]["url"]}{anchor["href"]}', config, producer, anchor_title))
                threads.append(thread)
                thread.start()
        next_page = soup.find("a", class_="next")
        if next_page is None:
            url = next_url(process_categories)
        else:
            url = next_page["href"]
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['brouwland'])

parser_dreamland.py (+180, -0)

@@ -0,0 +1,180 @@
#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import re
import traceback
import sys
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer
from ratelimit import limits, sleep_and_retry

def get_categories(url):
    soup = dreamland_get_soup(url)
    categories = {}
    main_categories = soup.find_all("p", class_="subNav__categoryTitle")
    for cat in main_categories:
        anchor = cat.find("a")
        categories[anchor.text] = anchor['href']
    return categories

@sleep_and_retry
@limits(calls=1, period=2)
def dreamland_get_soup(url, payload=None, cookie=None, headers={}):
    return common.get_soup_page_no_limit(url, payload=payload, cookie=cookie, headers=headers)

def get_product_details(prod_url, config, kafka_producer):
    try:
        soup = dreamland_get_soup(prod_url)
        product = {}
        price = {}
        sku = soup.find("span", class_="sku").text.split(":")[1]
        title = soup.find("h1", class_="main_header").text
        image = soup.find("img", {'id': 'productMainImage'})['src']
        desc = soup.find("div", class_="product_text").text
        attrs = soup.find("div", {'id': 'Attributes_table'})
        items = attrs.find_all('li', attrs={'class': re.compile('.*attribute.*')})
        ean_list = []
        for item in items:
            if item.text.find('Merk') > -1:
                product['brand'] = item.find("span").text.strip('">').strip()
            if item.text.find('EAN') > -1:
                ean_codes = item.find_all("span")
                for code in ean_codes:
                    ean = code.text.strip('">').strip()
                    ean_list.append(ean)
        product['sku_code'] = sku
        product['url'] = prod_url
        product['title'] = title
        product['image_url'] = image
        product['info'] = desc
        product['ean_code'] = ", ".join(ean_list)
        if soup.find("div", class_="price red mini") is None:
            price['price'] = soup.find("div", class_="product_price").text.strip("\xa0")
            price['promo'] = 0
        else:
            price['price'] = soup.find("div", class_="price red mini").text.strip("\xa0")
            price['promo'] = 1
        common.add_product(kafka_producer, config['dreamland']['name'], product, price)
    except Exception as err:
        common.dump_failed_product(config['dreamland']['name'], prod_url, soup, err, traceback.format_exc())

def get_productListView(soup):
    scripts = soup.find_all("script")
    list_view_pattern = "\'(ProductListingView.*)',{"
    listview = None
    if scripts is None:
        return listview
    for script in scripts:
        if script.string is not None:
            if script.string.find("ProductListingView") > 0:
                listview_match = re.search(list_view_pattern, script.string)
                listview = listview_match.group(1)
    return listview

def next_url(url_list):
    if len(url_list) == 0:
        return None
    key = next(iter(url_list))
    url = url_list[key]
    del url_list[key]
    return url

def get_dreamland_productListingView(url, index=0):
    soup = None
    #payload='contentBeginIndex=0&productBeginIndex=__INDEX__&beginIndex=__INDEX__&orderBy=8&facetId=&pageView=grid&resultType=both&orderByContent=&searchTerm=&facet=&facetLimit=&minPrice=&maxPrice=&pageSize=&loadProductsList=true&storeId=1302&catalogId=15501&langId=-11&homePageURL=https%3A%2F%2Fwww.dreamland.be%2Fe%2Fnl%2Fdl%0A&commandContextCurrency=EUR&urlPrefixForHTTPS=https%3A%2F%2Fwww.dreamland.be%0A&urlPrefixForHTTP=https%3A%2F%2Fwww.dreamland.be%0A&wcc_integration_origin=&enableSKUListView=&widgetPrefix=12_4099276460824417158&pgl_widgetId=4099276460824417158&objectId=%20_12_-1011_4099276460824417158%0A&requesttype=ajax'
    headers = {}
    headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    headers['content-type'] = 'application/x-www-form-urlencoded'
    payload = f'contentBeginIndex=0&productBeginIndex={index}&beginIndex={index}&orderBy=8&facetId=&pageView=grid&resultType=both&orderByContent=&searchTerm=&facet=&facetLimit=&minPrice=&maxPrice=&pageSize=&loadProductsList=true&storeId=13102&catalogId=15501&langId=-11&homePageURL=https%3A%2F%2Fwww.dreamland.be%2Fe%2Fnl%2Fdl%0A&commandContextCurrency=EUR&urlPrefixForHTTPS=https%3A%2F%2Fwww.dreamland.be%0A&urlPrefixForHTTP=https%3A%2F%2Fwww.dreamland.be%0A&wcc_integration_origin=&enableSKUListView=&widgetPrefix=12_4099276460824417158&pgl_widgetId=4099276460824417158&objectId=_12_-1011_4099276460824417158%0A&requesttype=ajax'
    soup = dreamland_get_soup(url, payload=payload, headers=headers)
    return soup

maxthreads = 2
sema = threading.Semaphore(value=maxthreads)
threads = list()

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['dreamland']['name'], 'url': config['dreamland']['url'], 'image_url': config['dreamland']['logo']})
    common.clear_failed_product(config['dreamland']['name'])
    all_categories = get_categories(config['dreamland']['url'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        soup = common.get_soup_page(url)
        if soup is None:
            url = next_url(process_categories)
            continue
        product_listview = get_productListView(soup)
        extra_cat = soup.find_all("li", class_="singleFacet")
        if len(extra_cat) > 0:
            for cat in extra_cat:
                anchor = cat.find("a")
                if anchor['href'] not in all_categories.values():
                    anchor_title = anchor.find("span", class_="facetName").text
                    #print(f'added {anchor_title}- {anchor["href"]}')
                    process_categories[anchor_title] = anchor["href"]
                    all_categories[anchor_title] = anchor['href']
        index = 0
        while product_listview is not None:
            view_soup = get_dreamland_productListingView(f'{config["dreamland"]["url"]}/{product_listview}', index)
            #view_soup = common.get_soup_page(f'{config["dreamland"]["url"]}/{product_listview}')
            view_products = view_soup.find_all("div", class_="product_info")
            for product in view_products:
                index = index + 1
                name = product.find("div", class_="product_name")
                anchor = product.find("a")
                if anchor['href'] not in products.values():
                    products[name] = anchor["href"]
                    thread = threading.Thread(target=get_product_details, args=(anchor["href"], config, producer))
                    threads.append(thread)
                    thread.start()
            next_arrow = view_soup.find("a", class_="right_arrow")
            if next_arrow is None:
                product_listview = None
        url = next_url(process_categories)
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['dreamland'])

parser_fun.py (+166, -0)

@@ -0,0 +1,166 @@
#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import traceback
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer

def get_categories(cat_url):
    page = requests.get(cat_url)
    content = page.content[1:-1].decode('utf-8')
    content = content.replace("\\r\\n", "")
    content = content.replace('\\/', '/')
    content = content.replace('\\"', '"')
    content = content.replace(' & ', '&amp;')
    root = ET.fromstring(content)
    categories = {}
    for elm in list(root.iter()):
        if elm.tag == "a":
            if (elm.attrib['href'] == '#') or (elm.text is None):
                continue
            if (elm.text.upper().find("CADEAU") >= 0) or (elm.text.upper().find("VOLLEDIG") >= 0):
                continue
            if elm.attrib['href'] not in categories.values():
                categories[elm.text] = elm.attrib['href'].replace(config["fun"]["url"], "")
    return categories

def next_url(url_list):
    if len(url_list) == 0:
        return None
    key = next(iter(url_list))
    url = url_list[key]
    del url_list[key]
    return url

def get_product_details(prod_url, config, kafka_producer):
    soup = common.get_soup_page_no_limit(prod_url)
    if soup is None:
        return
    try:
        prod_view = soup.find("div", class_="product-view")
        prod_essential = prod_view.find("div", class_="product-essential")
        image = prod_view.find("img", {'itemprop': 'image'})['src']
        price = prod_essential.find("meta", {'itemprop': 'price'})['content']
        special_price = prod_essential.find("p", class_="special-price")
        promo = False
        promo_end = None
        if special_price is not None:
            promo = True
            promo_end = prod_essential.find("meta", {'itemprop': "priceValidUntil"})
            if promo_end is not None:
                promo_end = promo_end['content']
        title = prod_view.find("h1", {'itemprop': 'name'}).text
        sku = prod_view.find("meta", {'itemprop': 'sku'})['content']
        brand = prod_view.find("meta", {'itemprop': 'brand'})['content']
        description = prod_view.find("meta", {'itemprop': 'description'})['content']
        specs = prod_view.find("div", class_="tab-content tab-block-additional")
        spec_li = specs.find_all("li")
        info = ''
        for spec in spec_li:
            label = spec.find("span", class_="label").text
            content = spec.find("span", class_="data").text
            if label.upper().find("SKU") == -1 and label.upper().find("EAN") == -1:
                info = f'{info}{label}: {content} / '
        info = info[:-3]
        ean_code = ''
        ean_list = prod_view.find_all("li")
        for elm in ean_list:
            if elm is not None:
                if elm.text.upper().find("EAN") >= 0:
                    ean_code = elm.find("span", class_="data").text
        product_details = {'title': title, 'url': prod_url, 'sku_code': sku, 'brand': brand, 'description': description, 'image_url': image, 'ean_code': ean_code, 'info': info}
        price_details = {'price': price, "promo": promo, "promo_end": promo_end}
        common.add_product(kafka_producer, config['fun']['name'], product_details, price_details)
    except Exception as err:
        common.dump_failed_product(config['fun']['name'], prod_url, soup, err, traceback.format_exc())

maxthreads = 10
sema = threading.Semaphore(value=maxthreads)
threads = list()

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['fun']['name'], 'url': config['fun']['url'], 'image_url': config['fun']['logo']})
    common.clear_failed_product(config['fun']['name'])
    all_categories = get_categories(config['fun']['categories_url'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        cat_url = config["fun"]["url"] + url
        soup = common.get_soup_page_no_limit(cat_url)
        subcat = soup.find_all("li", class_="item last")
        if len(subcat) == 0:
            big_urls = soup.find_all("big")
            for b_url in big_urls:
                b_href = b_url.find("a")
                if b_href:
                    try:
                        b_url = b_href['href'].replace(config["fun"]["url"], "")
                        if b_url not in all_categories.values():
                            process_categories[b_href.text] = b_url
                            all_categories[b_href.text] = b_url
                    except Exception as err:
                        print(url)
                        print("+++++++++++++")
                        print(cat_url)
                        print("+++++++++++++")
                        print(b_href)
                        print("+++++++++++++")
                        print(err)
                        print("=============")
            url = next_url(process_categories)
        else:
            for sc in subcat:
                product = sc.find("h2", class_="product-name")
                p_info = product.find("a")
                if p_info['href'] not in products.values():
                    products[p_info['title']] = p_info['href']
                    thread = threading.Thread(target=get_product_details, args=(p_info['href'], config, producer))
                    threads.append(thread)
                    thread.start()
            next_page = soup.find("a", class_="next i-next")
            if next_page is None:
                url = next_url(process_categories)
            else:
                url = next_page['href'].replace(config["fun"]["url"], "")
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['fun'])

parser_hubo.py (+95, -0)

@@ -0,0 +1,95 @@
#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import re
import traceback
import sys
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer

def get_product_details(prod_url, config, kafka_producer, json_product):
    try:
        soup = common.get_soup_page_no_limit(prod_url)
        description = soup.find('div', class_='pdp-description__content')
        ean_row = soup.find_all('div', class_='row border-bottom')
        product = {}
        price = {}
        product['title'] = json_product['title']
        product['url'] = prod_url
        if 'image' in json_product:
            product['image_url'] = json_product['image']
        if description.find("p") is not None:
            product['info'] = description.find("p").text
        product['sku_code'] = json_product['sku']
        for row in ean_row:
            if row.text.find('EAN- / barcode') > -1:
                ean_pattern = "([0-9]{8,13})"
                ean_match = re.search(ean_pattern, row.text)
                product['ean_code'] = ean_match.group(1)
        if 'brand' in json_product:
            product['brand'] = json_product['brand']
        price['price'] = json_product['price']
        if 'discount' in json_product:
            price['promo'] = 1
        else:
            price['promo'] = 0
        common.add_product(kafka_producer, config['hubo']['name'], product, price)
    except Exception as err:
        common.dump_failed_product(config['hubo']['name'], prod_url, soup, err, traceback.format_exc())

def next_url(url_list):
    if len(url_list) == 0:
        return None
    key = next(iter(url_list))
    url = url_list[key]
    del url_list[key]
    return url

maxthreads = 5
sema = threading.Semaphore(value=maxthreads)
threads = list()

if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['hubo']['name'], 'url': config['hubo']['url'], 'image_url': config['hubo']['logo']})
    common.clear_failed_product(config['hubo']['name'])
    get_doc = requests.get(config['hubo']['products_url'])
    data_json = json.loads(unidecode.unidecode(get_doc.content.decode('utf-8', 'ignore')))
    offset = 1
    while len(data_json['docs']) > 0:
        print(f'{offset}/{data_json["doc_count"]}')
        for product in data_json['docs']:
            thread = threading.Thread(target=get_product_details, args=(f'{config["hubo"]["url"]}{product["url"]}', config, producer, product))
            threads.append(thread)
            thread.start()
        offset = offset + int(data_json['limit'])
        get_doc = requests.get(config['hubo']['products_url_offset'].replace('__OFFSET__', str(offset)))
        data_json = json.loads(unidecode.unidecode(get_doc.content.decode('utf-8', 'ignore')))
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['hubo'])
