|
|
- #!/usr/bin/env python3
-
- import requests
- import logging
- import xml.etree.ElementTree as ET
- import json
- import unidecode
- import sqlite3
- import common
- import threading
- import re
- import traceback
- import sys
-
- from bs4 import BeautifulSoup
- from datetime import datetime
- from kafka import KafkaProducer
- from ratelimit import limits, sleep_and_retry
-
-
def get_categories(url):
    """Collect the main category links from the page at *url*.

    The page is fetched through ``dreamland_get_soup``; every
    ``<p class="subNav__categoryTitle">`` element contributes the text and
    ``href`` of its first ``<a>`` descendant.

    Args:
        url: Address of the page whose category navigation is scraped.

    Returns:
        dict mapping anchor text to the anchor's ``href``. Empty when no
        page was returned or no usable category link was found.
    """
    soup = dreamland_get_soup(url)
    if soup is None:
        # NOTE(review): the fetch helper lives in ``common`` and is not
        # visible here; presumably it can return None on failure (the main
        # loop checks for that on a sibling helper) - confirm.
        return {}

    categories = {}
    for title in soup.find_all("p", class_="subNav__categoryTitle"):
        anchor = title.find("a")
        # A title without a link cannot be crawled; skip it instead of
        # aborting the whole category discovery.
        if anchor is None or anchor.get("href") is None:
            continue
        categories[anchor.text] = anchor["href"]

    return categories
-
-
@sleep_and_retry
@limits(calls=1, period=2)
def dreamland_get_soup(url, payload=None, cookie=None, headers=None):
    """Fetch *url* through ``common.get_soup_page_no_limit`` under a rate limit.

    The function is decorated with ``limits(calls=1, period=2)`` and
    ``sleep_and_retry`` from the ``ratelimit`` package, so calls are
    throttled to one per two-second period (see the ratelimit docs for the
    exact waiting semantics).

    Args:
        url: Address to request.
        payload: Optional request body, forwarded unchanged.
        cookie: Optional cookie value, forwarded unchanged.
        headers: Optional dict of request headers. ``None`` (the default)
            is replaced by a fresh empty dict on every call.

    Returns:
        Whatever ``common.get_soup_page_no_limit`` returns for the request.
    """
    # A ``{}`` default would be created once and shared by every call; build
    # a new dict per call so no caller can leak headers into another request.
    if headers is None:
        headers = {}
    return common.get_soup_page_no_limit(url, payload=payload, cookie=cookie, headers=headers)
-
-
def get_product_details(prod_url, config, kafka_producer):
    """Scrape one product page and forward it via ``common.add_product``.

    Any exception raised while fetching or parsing is caught and reported
    through ``common.dump_failed_product``; nothing is re-raised, so a
    single broken page does not stop the caller.

    Args:
        prod_url: Address of the product page; also stored as
            ``product['url']``.
        config: Mapping from which ``config['dreamland']['name']`` is read.
        kafka_producer: Passed through unchanged to ``common.add_product``.

    Returns:
        None.
    """
    # Bound before the ``try`` so the error handler can always reference it,
    # even when the fetch itself is what raised.
    soup = None
    try:
        soup = dreamland_get_soup(prod_url)

        product = {}
        price = {}
        sku = soup.find("span", class_="sku").text.split(":")[1]
        title = soup.find("h1", class_="main_header").text
        image = soup.find("img", {'id': 'productMainImage'})['src']
        desc = soup.find("div", class_="product_text").text
        attrs = soup.find("div", {'id': 'Attributes_table'})
        items = attrs.find_all('li', attrs={'class': re.compile('.*attribute.*')})

        ean_list = []
        for item in items:
            item_text = item.text
            if 'Merk' in item_text:
                product['brand'] = item.find("span").text.strip('">').strip()
            if 'EAN' in item_text:
                # One attribute row may list several EAN codes, one per span.
                for code in item.find_all("span"):
                    ean_list.append(code.text.strip('">').strip())

        product['sku_code'] = sku
        product['url'] = prod_url
        product['title'] = title
        product['image_url'] = image
        product['info'] = desc
        product['ean_code'] = ", ".join(ean_list)

        # A "price red mini" element marks a promotion; when present, its
        # text is used as the price instead of the regular price element.
        promo_tag = soup.find("div", class_="price red mini")
        if promo_tag is None:
            price['price'] = soup.find("div", class_="product_price").text.strip("€\xa0")
            price['promo'] = 0
        else:
            price['price'] = promo_tag.text.strip("€\xa0")
            price['promo'] = 1

        common.add_product(kafka_producer, config['dreamland']['name'], product, price)
    except Exception as err:
        common.dump_failed_product(config['dreamland']['name'], prod_url, soup, err, traceback.format_exc())
-
-
def get_productListView(soup):
    """Extract the ``ProductListingView...`` reference embedded in a script.

    Every ``<script>`` element of *soup* whose text contains
    ``ProductListingView`` is searched for a single-quoted value starting
    with that word and followed by ``,{``. When several scripts match, the
    value from the last one is returned.

    Args:
        soup: Parsed page offering ``find_all("script")``; each result must
            expose a ``string`` attribute (possibly ``None``).

    Returns:
        The captured text without the surrounding quotes, or ``None`` when
        no script holds a matching value.
    """
    list_view_pattern = "\'(ProductListingView.*)',{"
    listview = None
    for script in soup.find_all("script") or []:
        content = script.string
        if content is None or "ProductListingView" not in content:
            continue
        listview_match = re.search(list_view_pattern, content)
        # The word may appear without the quoted form; such scripts are
        # ignored rather than dereferencing a failed match.
        if listview_match is not None:
            listview = listview_match.group(1)
    return listview
-
-
def next_url(url_list):
    """Remove and return the first value stored in *url_list*.

    Args:
        url_list: dict used as a work queue; it is mutated in place.

    Returns:
        The value of the first key in iteration order, or ``None`` when
        the dict is empty (or ``None`` itself).
    """
    if not url_list:
        return None

    first_key = next(iter(url_list))
    return url_list.pop(first_key)
-
-
def get_dreamland_productListingView(url, index=0):
    """Request a product-listing view starting at product offset *index*.

    A form-encoded body is built in which ``productBeginIndex`` and
    ``beginIndex`` are both set to *index*, and it is sent together with a
    browser-like ``user-agent`` header through ``dreamland_get_soup``.

    Args:
        url: Address of the listing view to request.
        index: Offset of the first product to return (default ``0``).

    Returns:
        Whatever ``dreamland_get_soup`` returns for the request.
    """
    request_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
        'content-type': 'application/x-www-form-urlencoded',
    }
    form_body = f'contentBeginIndex=0&productBeginIndex={index}&beginIndex={index}&orderBy=8&facetId=&pageView=grid&resultType=both&orderByContent=&searchTerm=&facet=&facetLimit=&minPrice=&maxPrice=&pageSize=&loadProductsList=true&storeId=13102&catalogId=15501&langId=-11&homePageURL=https%3A%2F%2Fwww.dreamland.be%2Fe%2Fnl%2Fdl%0A&commandContextCurrency=EUR&urlPrefixForHTTPS=https%3A%2F%2Fwww.dreamland.be%0A&urlPrefixForHTTP=https%3A%2F%2Fwww.dreamland.be%0A&wcc_integration_origin=&enableSKUListView=&widgetPrefix=12_4099276460824417158&pgl_widgetId=4099276460824417158&objectId=_12_-1011_4099276460824417158%0A&requesttype=ajax'

    return dreamland_get_soup(url, payload=form_body, headers=request_headers)
-
-
# Upper bound on product-detail worker threads alive at the same time.
maxthreads = 2
# One permit per allowed worker: taken before a thread starts, returned
# when that thread finishes.
sema = threading.Semaphore(value=maxthreads)
# Every worker thread ever started, so the script can join them at the end.
threads = list()


def _fetch_product_limited(prod_url, config, kafka_producer):
    """Run ``get_product_details`` and give the worker permit back afterwards."""
    try:
        get_product_details(prod_url, config, kafka_producer)
    finally:
        sema.release()


def _queue_extra_categories(soup, all_categories, process_categories):
    """Add facet links of a category page that are not yet known to both dicts."""
    for facet in soup.find_all("li", class_="singleFacet"):
        anchor = facet.find("a")
        if anchor is None or anchor.get('href') is None:
            continue
        if anchor['href'] in all_categories.values():
            continue
        name_tag = anchor.find("span", class_="facetName")
        if name_tag is None:
            continue
        anchor_title = name_tag.text
        process_categories[anchor_title] = anchor["href"]
        all_categories[anchor_title] = anchor['href']


def _crawl_listing(product_listview, config, producer, seen_product_urls):
    """Page through one listing view and start a worker for every new product URL."""
    index = 0
    while True:
        view_soup = get_dreamland_productListingView(f'{config["dreamland"]["url"]}/{product_listview}', index)
        if view_soup is None:
            break

        view_products = view_soup.find_all("div", class_="product_info")
        if not view_products:
            # Without products the offset cannot advance, so asking again
            # would request the very same page forever.
            break

        for product in view_products:
            # The offset counts every tile shown, including skipped ones.
            index = index + 1
            anchor = product.find("a")
            if anchor is None or anchor.get('href') is None:
                continue
            href = anchor['href']
            if href in seen_product_urls:
                continue
            seen_product_urls.add(href)

            # Block until a permit is free so that no more than
            # ``maxthreads`` workers exist at once.
            sema.acquire()
            thread = threading.Thread(target=_fetch_product_limited, args=(href, config, producer))
            threads.append(thread)
            try:
                thread.start()
            except BaseException:
                # The worker never ran, so it cannot return the permit.
                sema.release()
                raise

        if view_soup.find("a", class_="right_arrow") is None:
            break


def main():
    """Crawl all categories, scrape every product once and update store prices."""
    config = common.get_config()

    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['dreamland']['name'], 'url': config['dreamland']['url'], 'image_url': config['dreamland']['logo']})
    common.clear_failed_product(config['dreamland']['name'])

    all_categories = get_categories(config['dreamland']['url'])

    # ``process_categories`` is the work queue; ``all_categories`` remembers
    # everything ever queued so a category is never queued twice.
    process_categories = all_categories.copy()
    seen_product_urls = set()

    i = 0
    url = next_url(process_categories)
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        soup = common.get_soup_page(url)
        if soup is not None:
            product_listview = get_productListView(soup)
            _queue_extra_categories(soup, all_categories, process_categories)
            if product_listview is not None:
                _crawl_listing(product_listview, config, producer, seen_product_urls)
        url = next_url(process_categories)

    for t in threads:
        t.join()

    common.update_store_prices(producer, config['dreamland'])


if __name__ == "__main__":
    main()
|