#!/usr/bin/env python3
"""Crawl the store configured under ``config['dreamland']``.

The script walks the category pages of the store, pages through each
category's product listing view, and hands every product page to
``get_product_details`` on its own thread.  Scraped products and prices are
forwarded through the project's ``common`` helpers together with a Kafka
producer.
"""
import json
import logging
import re
import sqlite3
import sys
import threading
import traceback
import xml.etree.ElementTree as ET
from datetime import datetime

import requests
import unidecode
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from ratelimit import limits, sleep_and_retry

import common

# Matches list items whose class attribute mentions "attribute".
_ATTRIBUTE_CLASS_RE = re.compile('.*attribute.*')
# Captures the quoted "ProductListingView..." path embedded in an inline script.
_LIST_VIEW_RE = re.compile(r"'(ProductListingView.*)',{")


def get_categories(url):
    """Return ``{category title: href}`` for the main navigation categories.

    Categories are read from ``<p class="subNav__categoryTitle">`` elements on
    the page at *url*.  Entries without a link are skipped, and an empty dict
    is returned when no page could be fetched.
    """
    soup = dreamland_get_soup(url)
    if soup is None:
        return {}
    categories = {}
    for title in soup.find_all("p", class_="subNav__categoryTitle"):
        anchor = title.find("a")
        href = anchor.get("href") if anchor is not None else None
        if href is None:
            continue
        categories[anchor.text] = href
    return categories


@sleep_and_retry
@limits(calls=1, period=2)
def dreamland_get_soup(url, payload=None, cookie=None, headers=None):
    """Fetch *url* through ``common.get_soup_page_no_limit``, rate limited.

    The decorators allow one call per two seconds; callers over the limit
    sleep and retry instead of failing.

    ``headers`` defaults to a fresh empty dict per call (a shared mutable
    default could leak headers between unrelated requests).
    """
    # NOTE(review): the return type is whatever common.get_soup_page_no_limit
    # yields -- presumably a BeautifulSoup document or None; confirm in common.
    if headers is None:
        headers = {}
    return common.get_soup_page_no_limit(url, payload=payload, cookie=cookie, headers=headers)


def get_product_details(prod_url, config, kafka_producer):
    """Scrape one product page and publish it via ``common.add_product``.

    Any failure (network error, missing element, ...) is recorded with
    ``common.dump_failed_product`` instead of propagating, because this
    function is used as a thread target and nobody could catch the exception.
    """
    # Bound before the try so the except handler can always reference it,
    # even when the fetch itself is what failed.
    soup = None
    try:
        soup = dreamland_get_soup(prod_url)
        product = {}

        sku = soup.find("span", class_="sku").text.split(":")[1]
        title = soup.find("h1", class_="main_header").text
        image = soup.find("img", {'id': 'productMainImage'})['src']
        desc = soup.find("div", class_="product_text").text

        attrs = soup.find("div", {'id': 'Attributes_table'})
        ean_list = []
        for item in attrs.find_all('li', attrs={'class': _ATTRIBUTE_CLASS_RE}):
            item_text = item.text
            if 'Merk' in item_text:
                product['brand'] = item.find("span").text.strip('">').strip()
            if 'EAN' in item_text:
                for code in item.find_all("span"):
                    ean_list.append(code.text.strip('">').strip())

        product['sku_code'] = sku
        product['url'] = prod_url
        product['title'] = title
        product['image_url'] = image
        product['info'] = desc
        product['ean_code'] = ", ".join(ean_list)

        # A "price red mini" element is treated as the promotional price;
        # otherwise the regular "product_price" element is used.
        promo_tag = soup.find("div", class_="price red mini")
        price = {}
        if promo_tag is None:
            price['price'] = soup.find("div", class_="product_price").text.strip("€\xa0")
            price['promo'] = 0
        else:
            price['price'] = promo_tag.text.strip("€\xa0")
            price['promo'] = 1

        common.add_product(kafka_producer, config['dreamland']['name'], product, price)
    except Exception as err:
        common.dump_failed_product(config['dreamland']['name'], prod_url, soup, err, traceback.format_exc())


def get_productListView(soup):
    """Return the ``ProductListingView...`` path found in the page's scripts.

    Every inline ``<script>`` mentioning ``ProductListingView`` is searched;
    when several match, the last one wins.  Returns ``None`` when no script
    contains a matching path.
    """
    listview = None
    for script in soup.find_all("script"):
        text = script.string
        if text is None or "ProductListingView" not in text:
            continue
        match = _LIST_VIEW_RE.search(text)
        # A script may mention the name without the quoted path form.
        if match is not None:
            listview = match.group(1)
    return listview


def next_url(url_list):
    """Remove and return the first value of the dict *url_list*.

    Returns ``None`` when the dict is empty.
    """
    if not url_list:
        return None
    first_key = next(iter(url_list))
    return url_list.pop(first_key)


def get_dreamland_productListingView(url, index=0):
    """Request one page of a product listing view, starting at *index*.

    *index* is substituted for both ``productBeginIndex`` and ``beginIndex``
    in the form-encoded payload.  Returns whatever ``dreamland_get_soup``
    returns for the request.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
        'content-type': 'application/x-www-form-urlencoded',
    }
    payload = f'contentBeginIndex=0&productBeginIndex={index}&beginIndex={index}&orderBy=8&facetId=&pageView=grid&resultType=both&orderByContent=&searchTerm=&facet=&facetLimit=&minPrice=&maxPrice=&pageSize=&loadProductsList=true&storeId=13102&catalogId=15501&langId=-11&homePageURL=https%3A%2F%2Fwww.dreamland.be%2Fe%2Fnl%2Fdl%0A&commandContextCurrency=EUR&urlPrefixForHTTPS=https%3A%2F%2Fwww.dreamland.be%0A&urlPrefixForHTTP=https%3A%2F%2Fwww.dreamland.be%0A&wcc_integration_origin=&enableSKUListView=&widgetPrefix=12_4099276460824417158&pgl_widgetId=4099276460824417158&objectId=_12_-1011_4099276460824417158%0A&requesttype=ajax'
    return dreamland_get_soup(url, payload=payload, headers=headers)


# NOTE(review): `sema` is created but never acquired anywhere in this file, so
# `maxthreads` does not actually bound the number of worker threads; they are
# only throttled by the rate limit on dreamland_get_soup.  Confirm whether the
# semaphore was meant to wrap get_product_details.
maxthreads = 2
sema = threading.Semaphore(value=maxthreads)
threads = []


def _queue_new_categories(soup, seen_urls, pending):
    """Add facet links on *soup* that were not seen before to *pending*."""
    for facet in soup.find_all("li", class_="singleFacet"):
        anchor = facet.find("a")
        href = anchor.get("href") if anchor is not None else None
        if href is None or href in seen_urls:
            continue
        seen_urls.add(href)
        # Keyed by URL: keying by facet title would let two categories that
        # share a title overwrite each other while still pending.
        pending[href] = href


def _scrape_listing(listing_url, seen_products, config, producer):
    """Page through one listing view and start a worker per new product."""
    index = 0
    while True:
        view_soup = get_dreamland_productListingView(listing_url, index)
        if view_soup is None:
            return
        view_products = view_soup.find_all("div", class_="product_info")
        for product in view_products:
            index += 1
            anchor = product.find("a")
            href = anchor.get("href") if anchor is not None else None
            if href is None or href in seen_products:
                continue
            seen_products.add(href)
            thread = threading.Thread(target=get_product_details, args=(href, config, producer))
            threads.append(thread)
            thread.start()
        # Stop on the last page.  Also stop when a page yields no products:
        # `index` would not advance and the same page would be refetched
        # forever.
        if not view_products or view_soup.find("a", class_="right_arrow") is None:
            return


def main():
    """Crawl every category, scrape its products, then update store prices."""
    config = common.get_config()
    store = config['dreamland']
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': store['name'], 'url': store['url'], 'image_url': store['logo']})
    common.clear_failed_product(store['name'])

    all_categories = get_categories(store['url'])
    seen_category_urls = set(all_categories.values())
    process_categories = {href: href for href in all_categories.values()}
    seen_products = set()

    processed = 0
    url = next_url(process_categories)
    while url is not None:
        processed += 1
        print(f'{processed}/{len(seen_category_urls)} - {len(process_categories)}')
        soup = common.get_soup_page(url)
        if soup is not None:
            product_listview = get_productListView(soup)
            _queue_new_categories(soup, seen_category_urls, process_categories)
            if product_listview is not None:
                _scrape_listing(f'{store["url"]}/{product_listview}', seen_products, config, producer)
        url = next_url(process_categories)

    # Wait for every product worker before publishing the price update.
    for worker in threads:
        worker.join()
    common.update_store_prices(producer, store)


if __name__ == "__main__":
    main()