You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

153 lines
5.1 KiB

3 years ago
  1. #!/usr/bin/env python3
  2. import requests
  3. import logging
  4. import xml.etree.ElementTree as ET
  5. import json
  6. import unidecode
  7. import sqlite3
  8. import common
  9. import threading
  10. import re
  11. import traceback
  12. import sys
  13. import itertools
  14. import logging
  15. #logging.basicConfig(level=logging.DEBUG)
  16. from time import sleep
  17. from random import randint
  18. from bs4 import BeautifulSoup
  19. from datetime import datetime
  20. from kafka import KafkaProducer
  21. from ratelimit import limits, sleep_and_retry
  22. def get_categories(url):
  23. soup = common.get_soup_page(url)
  24. categories = {}
  25. articles = soup.find_all('article', class_="item bg-gray")
  26. for article in articles:
  27. anchor = article.find('a')
  28. anchor_title = anchor['title'].split(' | ')[0]
  29. categories[anchor_title] = anchor['href']
  30. return categories
  31. def get_product_details(prod_url, config, kafka_producer, anchor_title):
  32. try:
  33. soup = common.get_soup_page(prod_url)
  34. if soup is None:
  35. common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None)
  36. return
  37. prod_detail = soup.find('article')
  38. if prod_detail is None:
  39. common.dump_failed_product(config['brouwland']['name'], prod_url, soup, "No product details found", None)
  40. return
  41. title = prod_detail.find('h1', {'itemprop': 'name'})
  42. image = prod_detail.find('a', class_='image-click').find('img')
  43. description = prod_detail.find('div', {'itemprop': 'description'})
  44. ean_code = prod_detail.find('td', {'itemprop': 'eancode'})
  45. sku = prod_detail.find('span', {'itemprop': 'sku'})
  46. brand = prod_detail.find('span', {'itemprop': 'brand'})
  47. price_detail = prod_detail.find('span', {'itemprop': 'price'})
  48. product = {}
  49. price = {}
  50. if title is not None:
  51. product['title'] = title.text.strip()
  52. product['url'] = prod_url
  53. product['image_url'] = image['src']
  54. if description is not None:
  55. product['info'] = description.text.strip()
  56. if sku is not None:
  57. product['sku_code'] = sku.text.strip()
  58. if ean_code is not None:
  59. product['ean_code'] = ean_code.text.strip()
  60. if brand is not None:
  61. product['brand'] = brand.text.strip()
  62. price['price'] = price_detail.text.split()[1]
  63. price['promo'] = 0 if soup.find("del") is None else 1
  64. common.add_product(kafka_producer, config['brouwland']['name'], product, price)
  65. except Exception as err:
  66. print(traceback.format_exc())
  67. common.dump_failed_product(config['brouwland']['name'], prod_url, soup, err, traceback.format_exc())
  68. def next_url(url_list):
  69. if len(url_list) == 0:
  70. return None
  71. key = next(iter(url_list))
  72. url = url_list[key]
  73. del url_list[key]
  74. return url
# Shared thread bookkeeping for the scraper.
# `sema` is sized to allow at most `maxthreads` concurrent workers.
# NOTE(review): in the code visible here the semaphore is created but never
# acquired (threads are started without it), so the cap has no effect —
# confirm whether get_product_details was meant to acquire it.
maxthreads = 5
sema = threading.Semaphore(value=maxthreads)
# All started worker threads; joined at the end of __main__ before
# publishing final store prices.
threads = list()
  78. if __name__ == "__main__" :
  79. config = common.get_config()
  80. producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
  81. common.insert_update_store(producer, {'store': config['brouwland']['name'], 'url': config['brouwland']['url'], 'image_url': config['brouwland']['logo']})
  82. common.clear_failed_product(config['brouwland']['name'])
  83. all_categories = get_categories(config['brouwland']['catalogue_url'])
  84. process_categories = all_categories.copy()
  85. url = next_url(process_categories)
  86. products = {}
  87. i = 0
  88. while url is not None:
  89. i = i + 1
  90. print(f'{i}/{len(all_categories)} - {len(process_categories)}')
  91. url = f'{config["brouwland"]["url"]}{url}'
  92. soup = common.get_soup_page(url)
  93. if soup is None:
  94. url = next_url(process_categories)
  95. continue
  96. page_products = soup.find_all("article", class_="product")
  97. extra_cat = soup.find_all("article", class_="item bg-gray")
  98. if len(extra_cat) > 0:
  99. for cat in extra_cat:
  100. anchor = cat.find('a')
  101. if anchor['href'] not in all_categories.values():
  102. anchor_title = anchor['title'].split(' | ')[0]
  103. process_categories[anchor_title] = f'{anchor["href"]}'
  104. all_categories[anchor_title] = anchor['href']
  105. #print(f'added {cat["data-href"]}')
  106. for product in page_products:
  107. anchor = product.find("a")
  108. if anchor['href'] not in products.values():
  109. anchor_title = anchor['title'].split(' | ')[0]
  110. if anchor_title.upper().find("CADEAU") > -1:
  111. continue
  112. products[anchor_title] = anchor["href"]
  113. thread = threading.Thread(target=get_product_details,args=(f'{config["brouwland"]["url"]}{anchor["href"]}', config, producer, anchor_title))
  114. threads.append(thread)
  115. thread.start()
  116. next_page = soup.find("a", class_="next")
  117. if next_page is None:
  118. url = next_url(process_categories)
  119. else:
  120. url = next_page["href"]
  121. for t in threads:
  122. t.join()
  123. common.update_store_prices(producer, config['brouwland'])