You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 

180 lines
7.3 KiB

#!/usr/bin/env python3
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import re
import traceback
import sys
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer
from ratelimit import limits, sleep_and_retry
def get_categories(url):
    """Collect the main navigation categories from a Dreamland page.

    The page is fetched through the rate-limited ``dreamland_get_soup`` and
    every ``<p class="subNav__categoryTitle">`` element is inspected; the
    first ``<a>`` inside each one supplies the category name and link.

    Args:
        url: Address of the page whose navigation should be parsed.

    Returns:
        dict mapping the anchor text of each category to its ``href``.
        Empty when no page was returned or no usable entry was found.
    """
    soup = dreamland_get_soup(url)
    if soup is None:
        # presumably the fetch helper yields None on failure (the main script
        # checks for that on a sibling helper) - TODO confirm
        return {}

    categories = {}
    for title in soup.find_all("p", class_="subNav__categoryTitle"):
        anchor = title.find("a")
        # find() gives None when the title holds no link; such entries (and
        # links without an href) cannot be crawled, so skip them.
        if anchor is None or anchor.get("href") is None:
            continue
        categories[anchor.text] = anchor["href"]
    return categories
@sleep_and_retry
@limits(calls=1, period=2)
def dreamland_get_soup(url, payload=None, cookie=None, headers=None):
    """Fetch a page via ``common.get_soup_page_no_limit`` under a rate limit.

    The ``limits(calls=1, period=2)`` / ``sleep_and_retry`` decorator pair
    throttles this function to one call per two-second period; calls made
    sooner wait and retry rather than fail.

    Args:
        url: Address to request.
        payload: Optional request body, forwarded unchanged.
        cookie: Optional cookie value, forwarded unchanged.
        headers: Optional dict of request headers. ``None`` (the default)
            means "no extra headers" and is forwarded as a fresh empty dict.

    Returns:
        Whatever ``common.get_soup_page_no_limit`` returns (presumably a
        parsed BeautifulSoup document - confirm against ``common``).
    """
    # A fresh dict per call: a ``{}`` default would be shared between calls
    # (and threads), so any mutation downstream would leak into later calls.
    if headers is None:
        headers = {}
    return common.get_soup_page_no_limit(url, payload=payload, cookie=cookie, headers=headers)
def get_product_details(prod_url, config, kafka_producer):
    """Scrape one product page and hand the result to ``common.add_product``.

    Reads SKU, title, main image, description, brand ("Merk") and EAN codes
    from the page, plus either the promo price (``div.price red mini``) or
    the regular price (``div.product_price``).

    Any exception raised while fetching or parsing is caught and reported via
    ``common.dump_failed_product`` together with the traceback, so a single
    broken page never propagates out of the worker thread.

    Args:
        prod_url: Address of the product page.
        config: Configuration mapping; ``config['dreamland']['name']`` is used
            as the store identifier.
        kafka_producer: Producer object passed through to ``common.add_product``.

    Returns:
        None.
    """
    # Bound before the try block so the except handler can always pass it on,
    # even when the fetch itself is what raised (otherwise the handler would
    # die with UnboundLocalError and the failure would go unrecorded).
    soup = None
    try:
        soup = dreamland_get_soup(prod_url)
        product = {}
        price = {}

        # The SKU element text looks like "<label>:<code>"; keep the code part.
        sku = soup.find("span", class_="sku").text.split(":")[1]
        title = soup.find("h1", class_="main_header").text
        image = soup.find("img", {'id': 'productMainImage'})['src']
        desc = soup.find("div", class_="product_text").text

        attrs = soup.find("div", {'id': 'Attributes_table'})
        items = attrs.find_all('li', attrs={'class': re.compile('.*attribute.*')})
        ean_list = []
        for item in items:
            if 'Merk' in item.text:
                # strip('">') removes stray quote / angle-bracket characters
                # from both ends before trimming whitespace.
                product['brand'] = item.find("span").text.strip('">').strip()
            if 'EAN' in item.text:
                ean_list.extend(
                    code.text.strip('">').strip() for code in item.find_all("span")
                )

        product['sku_code'] = sku
        product['url'] = prod_url
        product['title'] = title
        product['image_url'] = image
        product['info'] = desc
        product['ean_code'] = ", ".join(ean_list)

        # A promo price block, when present, takes precedence over the
        # regular price block.
        promo_price = soup.find("div", class_="price red mini")
        if promo_price is None:
            price['price'] = soup.find("div", class_="product_price").text.strip("\xa0")
            price['promo'] = 0
        else:
            price['price'] = promo_price.text.strip("\xa0")
            price['promo'] = 1

        common.add_product(kafka_producer, config['dreamland']['name'], product, price)
    except Exception as err:
        # Deliberate catch-all at the worker boundary: record and move on.
        common.dump_failed_product(config['dreamland']['name'], prod_url, soup, err, traceback.format_exc())
def get_productListView(soup):
    """Extract the ``ProductListingView...`` request path from a page's scripts.

    Every ``<script>`` element with a ``.string`` is searched for a
    single-quoted value that starts with ``ProductListingView`` and is
    directly followed by ``',{``.

    Args:
        soup: Parsed page exposing ``find_all("script")``.

    Returns:
        The captured text without the surrounding quotes, taken from the last
        script that matches, or ``None`` when no script matches.
    """
    # Greedy ``.*`` is kept from the original pattern: the capture runs up to
    # the last "',{" on the same line.
    list_view_pattern = re.compile(r"'(ProductListingView.*)',\{")
    listview = None
    for script in soup.find_all("script") or ():
        text = script.string
        # Cheap substring pre-filter; also covers a hit at position 0.
        if text is None or "ProductListingView" not in text:
            continue
        match = list_view_pattern.search(text)
        # The word can occur without the quoted form; that is not an error.
        if match is not None:
            listview = match.group(1)
    return listview
def next_url(url_list):
    """Remove and return the value stored under the first key of ``url_list``.

    Args:
        url_list: Mapping of pending entries; it is modified in place.

    Returns:
        The value that belonged to the first key in iteration order, or
        ``None`` when the mapping holds no entries.
    """
    try:
        first_key = next(iter(url_list))
    except StopIteration:
        # Nothing left to hand out.
        return None
    return url_list.pop(first_key)
def get_dreamland_productListingView(url, index=0):
    """Request one page of a product listing view.

    Sends a form-encoded body to ``url`` through the rate-limited
    ``dreamland_get_soup``; ``index`` is written into both the
    ``productBeginIndex`` and ``beginIndex`` fields of that body.

    Args:
        url: Address of the listing view endpoint.
        index: Offset of the first product to request (default 0).

    Returns:
        Whatever ``dreamland_get_soup`` returns for the request.
    """
    request_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
        'content-type': 'application/x-www-form-urlencoded',
    }
    form_body = f'contentBeginIndex=0&productBeginIndex={index}&beginIndex={index}&orderBy=8&facetId=&pageView=grid&resultType=both&orderByContent=&searchTerm=&facet=&facetLimit=&minPrice=&maxPrice=&pageSize=&loadProductsList=true&storeId=13102&catalogId=15501&langId=-11&homePageURL=https%3A%2F%2Fwww.dreamland.be%2Fe%2Fnl%2Fdl%0A&commandContextCurrency=EUR&urlPrefixForHTTPS=https%3A%2F%2Fwww.dreamland.be%0A&urlPrefixForHTTP=https%3A%2F%2Fwww.dreamland.be%0A&wcc_integration_origin=&enableSKUListView=&widgetPrefix=12_4099276460824417158&pgl_widgetId=4099276460824417158&objectId=_12_-1011_4099276460824417158%0A&requesttype=ajax'
    return dreamland_get_soup(url, payload=form_body, headers=request_headers)
# Size used for the semaphore below.
maxthreads = 2
# Semaphore initialised with maxthreads permits.
# NOTE(review): nothing in this section of the file acquires it - confirm
# whether it is meant to bound the product-detail worker threads.
sema = threading.Semaphore(maxthreads)
# Worker threads started by the main script; collected so they can be joined.
threads = []
if __name__ == "__main__":
    # Script entry point: register the store, walk every category page
    # (including facet categories discovered along the way), page through
    # each product listing and start one worker thread per new product URL.
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['dreamland']['name'], 'url': config['dreamland']['url'], 'image_url': config['dreamland']['logo']})
    common.clear_failed_product(config['dreamland']['name'])

    all_categories = get_categories(config['dreamland']['url'])
    # Work queue consumed by next_url(); starts as a copy of the main categories.
    process_categories = all_categories.copy()
    # Every category URL ever queued. A set gives O(1) lookups and, unlike
    # scanning all_categories.values(), still remembers a URL after a later
    # category with the same title has replaced its dict entry.
    known_category_urls = set(all_categories.values())
    # Product URL -> name element, for every product already dispatched.
    products = {}

    url = next_url(process_categories)
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        soup = common.get_soup_page(url)
        if soup is None:
            url = next_url(process_categories)
            continue
        product_listview = get_productListView(soup)

        # Queue facet sub-categories that have not been seen before.
        for cat in soup.find_all("li", class_="singleFacet"):
            anchor = cat.find("a")
            if anchor is None or anchor.get("href") is None:
                continue
            href = anchor["href"]
            if href in known_category_urls:
                continue
            facet_name = anchor.find("span", class_="facetName")
            if facet_name is None:
                continue
            #print(f'added {facet_name.text}- {href}')
            known_category_urls.add(href)
            # The queue is keyed by URL so that two facets sharing a title
            # cannot overwrite each other's pending entry.
            process_categories[href] = href
            all_categories[facet_name.text] = href

        index = 0
        while product_listview is not None:
            view_soup = get_dreamland_productListingView(f'{config["dreamland"]["url"]}/{product_listview}', index)
            if view_soup is None:
                # Listing page could not be fetched; move on to the next category.
                break
            view_products = view_soup.find_all("div", class_="product_info")
            if not view_products:
                # No products means index cannot advance; without this guard a
                # page that still shows a "next" arrow would be requested forever.
                break
            for product in view_products:
                # index counts every product seen and is the offset of the next page.
                index = index + 1
                name = product.find("div", class_="product_name")
                anchor = product.find("a")
                if anchor is None or anchor.get("href") is None:
                    continue
                href = anchor["href"]
                if href not in products:
                    products[href] = name
                    # NOTE(review): the module-level `sema` / `maxthreads` are not
                    # applied here, so the number of live threads is unbounded;
                    # confirm whether that is intended.
                    thread = threading.Thread(target=get_product_details, args=(href, config, producer))
                    threads.append(thread)
                    thread.start()
            next_arrow = view_soup.find("a", class_="right_arrow")
            if next_arrow is None:
                product_listview = None
        url = next_url(process_categories)

    # Wait for every product worker before the final price update.
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['dreamland'])