#!/usr/bin/env python3
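# Scraper for the DreamLand (dreamland.be) web shop: walks the category
# navigation, pages through each category's product listing, scrapes every
# product page, and publishes products and prices to Kafka via the shared
# `common` helper module (assumed to provide the get_soup_*, add_product and
# store bookkeeping helpers used below).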
import requests
import logging
import xml.etree.ElementTree as ET
import json
import unidecode
import sqlite3
import common
import threading
import re
import traceback
import sys
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer
from ratelimit import limits, sleep_and_retry
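
# Collect the top-level category links from the shop's navigation bar and
# return them as a {category title: URL} dict.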
def get_categories(url):
    soup = dreamland_get_soup(url)
    categories = {}
    main_categories = soup.find_all("p", class_="subNav__categoryTitle")
    for cat in main_categories:
        anchor = cat.find("a")
        categories[anchor.text] = anchor['href']
    return categories
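
# Rate-limited fetch helper: at most one request every two seconds, sleeping
# and retrying when the limit is hit. It delegates to the shared
# common.get_soup_page_no_limit() helper (assumed to return a BeautifulSoup
# object, or None on failure).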
@sleep_and_retry
@limits(calls=1, period=2)
def dreamland_get_soup(url, payload=None, cookie=None, headers=None):
    return common.get_soup_page_no_limit(url, payload=payload, cookie=cookie, headers=headers or {})
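
# Scrape a single product page: SKU, title, image, description, brand and
# EAN codes, plus the current (promo or regular) price, and hand the result
# to common.add_product() for publication on Kafka. Failures are recorded
# with common.dump_failed_product().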
def get_product_details(prod_url, config, kafka_producer):
    soup = None
    try:
        # Honour the module-level semaphore so at most `maxthreads` product
        # pages are scraped concurrently.
        with sema:
            soup = dreamland_get_soup(prod_url)
            product = {}
            price = {}
            sku = soup.find("span", class_="sku").text.split(":")[1]
            title = soup.find("h1", class_="main_header").text
            image = soup.find("img", {'id': 'productMainImage'})['src']
            desc = soup.find("div", class_="product_text").text
            attrs = soup.find("div", {'id': 'Attributes_table'})
            items = attrs.find_all('li', attrs={'class': re.compile('.*attribute.*')})
            ean_list = []
            for item in items:
                if item.text.find('Merk') > -1:
                    product['brand'] = item.find("span").text.strip('">').strip()
                if item.text.find('EAN') > -1:
                    ean_codes = item.find_all("span")
                    for code in ean_codes:
                        ean = code.text.strip('">').strip()
                        ean_list.append(ean)
            product['sku_code'] = sku
            product['url'] = prod_url
            product['title'] = title
            product['image_url'] = image
            product['info'] = desc
            product['ean_code'] = ", ".join(ean_list)
            if soup.find("div", class_="price red mini") is None:
                price['price'] = soup.find("div", class_="product_price").text.strip("\xa0")
                price['promo'] = 0
            else:
                price['price'] = soup.find("div", class_="price red mini").text.strip("\xa0")
                price['promo'] = 1
            common.add_product(kafka_producer, config['dreamland']['name'], product, price)
    except Exception as err:
        common.dump_failed_product(config['dreamland']['name'], prod_url, soup, err, traceback.format_exc())
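
# Each category page embeds the relative URL of what appears to be a
# WebSphere Commerce "ProductListingView" AJAX endpoint inside an inline
# <script>; extract that URL with a regular expression so the product grid
# can be paged through later.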
def get_productListView(soup):
    scripts = soup.find_all("script")
    list_view_pattern = r"'(ProductListingView.*)',\{"
    listview = None
    if scripts is None:
        return listview
    for script in scripts:
        if script.string is not None:
            if script.string.find("ProductListingView") > -1:
                listview_match = re.search(list_view_pattern, script.string)
                if listview_match is not None:
                    listview = listview_match.group(1)
    return listview
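
# Pop the next (title, URL) pair from the work list and return the URL,
# or None when every category has been processed.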
def next_url(url_list):
    if len(url_list) == 0:
        return None
    key = next(iter(url_list))
    url = url_list[key]
    del url_list[key]
    return url
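
# Fetch one page of the AJAX product grid. The form-encoded payload mirrors
# the parameters the site itself sends (store/catalog/language IDs, paging
# indices); only the begin index changes between calls.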
def get_dreamland_productListingView(url, index=0):
    soup = None
    #payload='contentBeginIndex=0&productBeginIndex=__INDEX__&beginIndex=__INDEX__&orderBy=8&facetId=&pageView=grid&resultType=both&orderByContent=&searchTerm=&facet=&facetLimit=&minPrice=&maxPrice=&pageSize=&loadProductsList=true&storeId=1302&catalogId=15501&langId=-11&homePageURL=https%3A%2F%2Fwww.dreamland.be%2Fe%2Fnl%2Fdl%0A&commandContextCurrency=EUR&urlPrefixForHTTPS=https%3A%2F%2Fwww.dreamland.be%0A&urlPrefixForHTTP=https%3A%2F%2Fwww.dreamland.be%0A&wcc_integration_origin=&enableSKUListView=&widgetPrefix=12_4099276460824417158&pgl_widgetId=4099276460824417158&objectId=%20_12_-1011_4099276460824417158%0A&requesttype=ajax'
    headers = {}
    headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    headers['content-type'] = 'application/x-www-form-urlencoded'
    payload = f'contentBeginIndex=0&productBeginIndex={index}&beginIndex={index}&orderBy=8&facetId=&pageView=grid&resultType=both&orderByContent=&searchTerm=&facet=&facetLimit=&minPrice=&maxPrice=&pageSize=&loadProductsList=true&storeId=13102&catalogId=15501&langId=-11&homePageURL=https%3A%2F%2Fwww.dreamland.be%2Fe%2Fnl%2Fdl%0A&commandContextCurrency=EUR&urlPrefixForHTTPS=https%3A%2F%2Fwww.dreamland.be%0A&urlPrefixForHTTP=https%3A%2F%2Fwww.dreamland.be%0A&wcc_integration_origin=&enableSKUListView=&widgetPrefix=12_4099276460824417158&pgl_widgetId=4099276460824417158&objectId=_12_-1011_4099276460824417158%0A&requesttype=ajax'
    soup = dreamland_get_soup(url, payload=payload, headers=headers)
    return soup
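
# Threading setup: `maxthreads` is the intended cap on concurrent
# product-detail scrapes; every spawned thread is tracked in `threads`
# and joined at the end of the main block.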
maxthreads = 2
sema = threading.Semaphore(value=maxthreads)
threads = list()
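
# Main crawl: register the store, clear previously failed products, walk
# every category (including facet sub-categories discovered along the way),
# page through each category's product grid, and spawn a thread per new
# product URL. Once all categories are exhausted, the threads are joined
# and store prices are updated.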
if __name__ == "__main__":
    config = common.get_config()
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['dreamland']['name'], 'url': config['dreamland']['url'], 'image_url': config['dreamland']['logo']})
    common.clear_failed_product(config['dreamland']['name'])
    all_categories = get_categories(config['dreamland']['url'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}
    i = 0
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        soup = common.get_soup_page(url)
        if soup is None:
            url = next_url(process_categories)
            continue
        product_listview = get_productListView(soup)
        extra_cat = soup.find_all("li", class_="singleFacet")
        if len(extra_cat) > 0:
            for cat in extra_cat:
                anchor = cat.find("a")
                if anchor['href'] not in all_categories.values():
                    anchor_title = anchor.find("span", class_="facetName").text
                    #print(f'added {anchor_title}- {anchor["href"]}')
                    process_categories[anchor_title] = anchor["href"]
                    all_categories[anchor_title] = anchor['href']
        index = 0
        while product_listview is not None:
            view_soup = get_dreamland_productListingView(f'{config["dreamland"]["url"]}/{product_listview}', index)
            #view_soup = common.get_soup_page(f'{config["dreamland"]["url"]}/{product_listview}')
            view_products = view_soup.find_all("div", class_="product_info")
            for product in view_products:
                index = index + 1
                name = product.find("div", class_="product_name")
                anchor = product.find("a")
                if anchor['href'] not in products.values():
                    products[name] = anchor["href"]
                    thread = threading.Thread(target=get_product_details, args=(anchor["href"], config, producer))
                    threads.append(thread)
                    thread.start()
            next_arrow = view_soup.find("a", class_="right_arrow")
            if next_arrow is None:
                product_listview = None
        url = next_url(process_categories)
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['dreamland'])