|
#!/usr/bin/env python3
|
|
|
|
import requests
|
|
import logging
|
|
import xml.etree.ElementTree as ET
|
|
import json
|
|
import unidecode
|
|
import sqlite3
|
|
import common
|
|
import threading
|
|
import re
|
|
import traceback
|
|
import sys
|
|
|
|
from bs4 import BeautifulSoup
|
|
from datetime import datetime
|
|
from kafka import KafkaProducer
|
|
from ratelimit import limits, sleep_and_retry
|
|
|
|
|
|
def get_categories(url):
    """Collect the main navigation categories from the page at *url*.

    The page is fetched through ``dreamland_get_soup`` and every
    ``<p class="subNav__categoryTitle">`` element is inspected for its first
    ``<a>`` child.

    Args:
        url: Address of the page whose navigation should be read.

    Returns:
        dict mapping the anchor text of each category to its ``href``.
        Title elements that carry no anchor, or whose anchor has no ``href``,
        are skipped. When two anchors share the same text the later one wins.
    """
    soup = dreamland_get_soup(url)

    categories = {}
    for title in soup.find_all("p", class_="subNav__categoryTitle"):
        anchor = title.find("a")
        if anchor is None:
            # A title without a link cannot be crawled; ignore it rather than
            # aborting the whole category discovery.
            continue
        href = anchor.get('href')
        if href is None:
            continue
        categories[anchor.text] = href

    return categories
|
|
|
|
|
|
@sleep_and_retry
@limits(calls=1, period=2)
def dreamland_get_soup(url, payload=None, cookie=None, headers=None):
    """Fetch *url* through ``common.get_soup_page_no_limit`` under a rate limit.

    The ``limits`` decorator is configured with ``calls=1, period=2`` and is
    wrapped in ``sleep_and_retry``, so every request made by this module goes
    through that throttle.

    Args:
        url: Address to request.
        payload: Optional request body forwarded unchanged to the helper.
        cookie: Optional cookie value forwarded unchanged to the helper.
        headers: Optional dict of request headers. When omitted, a fresh empty
            dict is passed on each call.

    Returns:
        Whatever ``common.get_soup_page_no_limit`` returns for the request.
    """
    if headers is None:
        # A new dict per call: a ``{}`` default would be shared between all
        # calls and could leak header changes from one request to the next.
        headers = {}
    return common.get_soup_page_no_limit(url, payload=payload, cookie=cookie, headers=headers)
|
|
|
|
|
|
def get_product_details(prod_url, config, kafka_producer):
    """Scrape one product page and publish the product and its price.

    The page at *prod_url* is fetched with ``dreamland_get_soup``; SKU, title,
    main image, description, brand and EAN codes are read from it and handed
    to ``common.add_product`` together with the price. Any exception raised
    while fetching or parsing is caught and reported through
    ``common.dump_failed_product`` instead of propagating.

    Args:
        prod_url: Address of the product page.
        config: Configuration mapping; ``config['dreamland']['name']`` is used
            as the store name.
        kafka_producer: Producer object passed through to ``common.add_product``.

    Returns:
        None.
    """
    # Bound before the try block so the except handler can always reference
    # it, even when the page fetch itself is what raised.
    soup = None
    try:
        soup = dreamland_get_soup(prod_url)

        product = {}
        price = {}
        # The SKU element text is split on ':' and the part after it is kept.
        sku = soup.find("span", class_="sku").text.split(":")[1]
        title = soup.find("h1", class_="main_header").text
        image = soup.find("img", {'id': 'productMainImage'})['src']
        desc = soup.find("div", class_="product_text").text
        attrs = soup.find("div", {'id': 'Attributes_table'})
        items = attrs.find_all('li', attrs={'class': re.compile('.*attribute.*')})
        ean_list = []
        for item in items:
            if item.text.find('Merk') > -1:
                product['brand'] = item.find("span").text.strip('">').strip()
            if item.text.find('EAN') > -1:
                # An EAN entry may hold several spans; collect every code.
                for code in item.find_all("span"):
                    ean_list.append(code.text.strip('">').strip())

        product['sku_code'] = sku
        product['url'] = prod_url
        product['title'] = title
        product['image_url'] = image
        product['info'] = desc
        product['ean_code'] = ", ".join(ean_list)

        # When a "price red mini" element exists its text is used and the
        # price is flagged as a promo; otherwise the regular price is used.
        promo_price = soup.find("div", class_="price red mini")
        if promo_price is None:
            price['price'] = soup.find("div", class_="product_price").text.strip("€\xa0")
            price['promo'] = 0
        else:
            price['price'] = promo_price.text.strip("€\xa0")
            price['promo'] = 1

        common.add_product(kafka_producer, config['dreamland']['name'], product, price)
    except Exception as err:
        # Deliberate catch-all: this function is run as a thread target, so a
        # failure on one page is recorded rather than raised.
        common.dump_failed_product(config['dreamland']['name'], prod_url, soup, err, traceback.format_exc())
|
|
|
|
|
|
def get_productListView(soup):
    """Extract the ``ProductListingView...`` reference embedded in a page script.

    Every ``<script>`` element of *soup* with inline text is searched for a
    single-quoted string that starts with ``ProductListingView`` and is
    immediately followed by ``,{``.

    Args:
        soup: Parsed page exposing ``find_all("script")``; each returned
            element must expose a ``string`` attribute (``None`` when the
            script has no inline text).

    Returns:
        The captured string (without the surrounding quotes) from the last
        script that matches, or ``None`` when no script matches.
    """
    # ``.*`` is greedy, so the capture extends to the last "',{" on the line.
    list_view_pattern = re.compile(r"'(ProductListingView.*)',{")

    listview = None
    for script in soup.find_all("script") or ():
        text = script.string
        if text is None or "ProductListingView" not in text:
            continue
        match = list_view_pattern.search(text)
        # The marker may be mentioned without the quoted form; skip such
        # scripts instead of failing on a missing match.
        if match is not None:
            listview = match.group(1)
    return listview
|
|
|
|
|
|
def next_url(url_list):
    """Remove and return the value of the first entry in *url_list*.

    Args:
        url_list: Mapping of pending entries (modified in place).

    Returns:
        The value stored under the first key in iteration order, or ``None``
        when the mapping is empty.
    """
    if not url_list:
        return None

    first_key = next(iter(url_list))
    return url_list.pop(first_key)
|
|
|
|
|
|
def get_dreamland_productListingView(url, index=0):
    """Request a product listing view starting at product offset *index*.

    Builds a form-encoded payload in which both ``productBeginIndex`` and
    ``beginIndex`` are set to *index*, adds browser-like ``user-agent`` and
    ``content-type`` headers, and sends it through ``dreamland_get_soup``.

    Args:
        url: Address of the listing view endpoint.
        index: Offset of the first product to request (default 0).

    Returns:
        Whatever ``dreamland_get_soup`` returns for the request.
    """
    request_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
        'content-type': 'application/x-www-form-urlencoded',
    }

    # NOTE: an earlier, disabled variant of this payload used storeId=1302
    # and a leading %20 in objectId; the values below are the active ones.
    payload = f'contentBeginIndex=0&productBeginIndex={index}&beginIndex={index}&orderBy=8&facetId=&pageView=grid&resultType=both&orderByContent=&searchTerm=&facet=&facetLimit=&minPrice=&maxPrice=&pageSize=&loadProductsList=true&storeId=13102&catalogId=15501&langId=-11&homePageURL=https%3A%2F%2Fwww.dreamland.be%2Fe%2Fnl%2Fdl%0A&commandContextCurrency=EUR&urlPrefixForHTTPS=https%3A%2F%2Fwww.dreamland.be%0A&urlPrefixForHTTP=https%3A%2F%2Fwww.dreamland.be%0A&wcc_integration_origin=&enableSKUListView=&widgetPrefix=12_4099276460824417158&pgl_widgetId=4099276460824417158&objectId=_12_-1011_4099276460824417158%0A&requesttype=ajax'

    return dreamland_get_soup(url, payload=payload, headers=request_headers)
|
|
|
|
|
|
# Module-level threading state shared with the script section below.
# NOTE(review): ``sema`` is created here but is not acquired anywhere in the
# visible code, so it does not currently limit the number of worker threads.
maxthreads = 2
sema = threading.Semaphore(maxthreads)
# Every worker thread that gets started is recorded here so it can be joined.
threads = []
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: register the store, walk every category page,
    # page through each product listing, and start one worker thread per
    # newly seen product URL. All workers are joined before the store prices
    # are updated.
    config = common.get_config()

    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['dreamland']['name'], 'url': config['dreamland']['url'], 'image_url': config['dreamland']['logo']})
    common.clear_failed_product(config['dreamland']['name'])

    all_categories = get_categories(config['dreamland']['url'])

    # ``process_categories`` is the work queue (consumed by next_url);
    # ``all_categories`` keeps everything discovered so far.
    process_categories = all_categories.copy()
    # Sets give constant-time "already seen" checks and are not affected by
    # two entries sharing the same title/name key.
    known_category_urls = set(all_categories.values())
    seen_product_urls = set()
    products = {}
    i = 0

    url = next_url(process_categories)
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        soup = common.get_soup_page(url)
        if soup is None:
            # Page could not be loaded; move on to the next category.
            url = next_url(process_categories)
            continue

        product_listview = get_productListView(soup)

        # Facet links on the category page are queued as extra categories.
        for cat in soup.find_all("li", class_="singleFacet"):
            anchor = cat.find("a")
            category_url = anchor['href']
            if category_url not in known_category_urls:
                anchor_title = anchor.find("span", class_="facetName").text
                process_categories[anchor_title] = category_url
                all_categories[anchor_title] = category_url
                known_category_urls.add(category_url)

        # ``index`` counts the products consumed so far and is sent as the
        # begin offset of the next listing request.
        index = 0
        while product_listview is not None:
            view_soup = get_dreamland_productListingView(f'{config["dreamland"]["url"]}/{product_listview}', index)
            # NOTE(review): guards against the fetch helper yielding None,
            # as common.get_soup_page can; confirm for the no-limit variant.
            view_products = [] if view_soup is None else view_soup.find_all("div", class_="product_info")
            if not view_products:
                # No products means ``index`` cannot advance; stop paging so
                # the same request is not repeated forever.
                break

            for product in view_products:
                index = index + 1
                name = product.find("div", class_="product_name")
                anchor = product.find("a")
                product_url = anchor["href"]
                if product_url not in seen_product_urls:
                    seen_product_urls.add(product_url)
                    # Store the name text (falling back to the URL) rather
                    # than the parsed element itself.
                    product_name = product_url if name is None else name.text.strip()
                    products[product_name] = product_url

                    thread = threading.Thread(target=get_product_details, args=(product_url, config, producer))
                    threads.append(thread)
                    thread.start()

            # Without a "next" arrow this was the last page of the listing.
            if view_soup.find("a", class_="right_arrow") is None:
                product_listview = None

        url = next_url(process_categories)

    for t in threads:
        t.join()

    common.update_store_prices(producer, config['dreamland'])
|