You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

165 lines
6.0 KiB

3 years ago
  1. #!/usr/bin/env python3
  2. import requests
  3. import logging
  4. import xml.etree.ElementTree as ET
  5. import json
  6. import unidecode
  7. import sqlite3
  8. import common
  9. import threading
  10. import traceback
  11. from bs4 import BeautifulSoup
  12. from datetime import datetime
  13. from kafka import KafkaProducer
  14. def get_categories(cat_url):
  15. page = requests.get(cat_url)
  16. content = page.content[1:-1].decode('utf-8')
  17. content = content.replace("\\r\\n", "")
  18. content = content.replace('\\/', '/')
  19. content = content.replace('\\"', '"')
  20. content = content.replace(' & ', '&')
  21. root = ET.fromstring(content)
  22. categories = {}
  23. for elm in list(root.iter()):
  24. if elm.tag == "a":
  25. if (elm.attrib['href'] == '#') or (elm.text is None):
  26. continue
  27. if (elm.text.upper().find("CADEAU") >=0) or (elm.text.upper().find("VOLLEDIG") >=0):
  28. continue
  29. if elm.attrib['href'] not in categories.values():
  30. categories[elm.text] = elm.attrib['href'].replace(config["fun"]["url"], "")
  31. return categories
  32. def next_url(url_list):
  33. if len(url_list) == 0:
  34. return None
  35. key = next(iter(url_list))
  36. url = url_list[key]
  37. del url_list[key]
  38. return url
def get_product_details(prod_url, config, kafka_producer):
    """Scrape a single product page and publish its details and price.

    prod_url: absolute URL of the product page.
    config: parsed configuration mapping; only config['fun']['name'] is read here.
    kafka_producer: producer handed through to common.add_product.

    Returns None. Any parsing failure is captured by the broad except below and
    dumped via common.dump_failed_product instead of propagating, so one broken
    page does not abort the crawl.
    """
    soup = common.get_soup_page_no_limit(prod_url)
    if soup is None:
        # Page could not be fetched/parsed; nothing to record.
        return
    try:
        prod_view = soup.find("div", class_="product-view")
        prod_essential = prod_view.find("div", class_="product-essential")
        image = prod_view.find("img", {'itemprop': 'image'})['src']
        price = prod_essential.find("meta", {'itemprop': 'price'})['content']
        # A "special-price" paragraph marks an active promotion.
        special_price = prod_essential.find("p", class_="special-price")
        promo = False
        promo_end = None
        if special_price is not None:
            promo = True
            # The promo end date is optional on the page.
            promo_end = prod_essential.find("meta", {'itemprop': "priceValidUntil"})
            if promo_end is not None:
                promo_end = promo_end['content']
        title = prod_view.find("h1", {'itemprop': 'name'}).text
        sku = prod_view.find("meta", {'itemprop': 'sku'})['content']
        brand = prod_view.find("meta", {'itemprop': 'brand'})['content']
        description = prod_view.find("meta", {'itemprop': 'description'})['content']
        specs = prod_view.find("div", class_="tab-content tab-block-additional")
        spec_li = specs.find_all("li")
        info = ''
        # Build "Label: value / Label: value" from the spec rows, skipping the
        # SKU and EAN rows (captured separately below).
        for spec in spec_li:
            label = spec.find("span", class_="label").text
            content = spec.find("span", class_="data").text
            if label.upper().find("SKU") == -1 and label.upper().find("EAN") == -1:
                info = f'{info}{label}: {content} / '
        info = info[:-3]  # drop the trailing ' / ' separator
        ean_code = ''
        # Scan all list items for one mentioning EAN and take its data span.
        ean_list = prod_view.find_all("li")
        for elm in ean_list:
            if elm is not None:
                if elm.text.upper().find("EAN") >= 0:
                    ean_code = elm.find("span", class_="data").text
        product_details = {'title': title, 'url': prod_url, 'sku_code': sku, 'brand': brand, 'description': description, 'image_url': image, 'ean_code': ean_code, 'info': info}
        price_details = {'price': price, "promo": promo, "promo_end": promo_end}
        common.add_product(kafka_producer, config['fun']['name'], product_details, price_details)
    except Exception as err:
        # Record the failing page (raw soup + traceback) for later inspection.
        common.dump_failed_product(config['fun']['name'], prod_url, soup, err, traceback.format_exc())
# Intended cap on concurrent per-product scraper threads.
maxthreads = 10
# NOTE(review): this semaphore is never acquired/released anywhere in the
# visible code, so thread creation below is effectively unbounded — confirm
# whether get_product_details was meant to use it.
sema = threading.Semaphore(value=maxthreads)
# Every spawned worker thread; all are joined before the final price update.
threads = list()
if __name__ == "__main__":
    config = common.get_config()
    # NOTE: 'kafka_boostrap_servers' (sic) is the key actually used in the
    # config file — do not "fix" the spelling here without changing the config.
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['fun']['name'], 'url': config['fun']['url'], 'image_url': config['fun']['logo']})
    common.clear_failed_product(config['fun']['name'])
    # Seed the work queue with the top-level categories; process_categories is
    # consumed (popped) while all_categories keeps the full set for dedup.
    all_categories = get_categories(config['fun']['categories_url'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}  # title -> href, used to avoid scraping a product twice
    i = 0
    while url is not None:
        i = i + 1
        # Progress: categories started / categories still queued.
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        cat_url = config["fun"]["url"] + url
        soup = common.get_soup_page_no_limit(cat_url)
        # NOTE(review): get_soup_page_no_limit can return None (see
        # get_product_details), which would raise AttributeError here — confirm.
        subcat = soup.find_all("li", class_="item last")
        if len(subcat) == 0:
            # No product listing: treat the page as an index of sub-categories
            # and enqueue any <big><a> links not already known.
            big_urls = soup.find_all("big")
            for b_url in big_urls:
                b_href = b_url.find("a")
                if b_href:
                    try:
                        b_url = b_href['href'].replace(config["fun"]["url"], "")
                        if b_url not in all_categories.values():
                            process_categories[b_href.text] = b_url
                            all_categories[b_href.text] = b_url
                    except Exception as err:
                        # Best-effort: log the offending anchor and carry on.
                        print(url)
                        print("+++++++++++++")
                        print(cat_url)
                        print("+++++++++++++")
                        print(b_href)
                        print("+++++++++++++")
                        print(err)
                        print("=============")
            url = next_url(process_categories)
        else:
            # Product listing page: scrape each new product in its own thread.
            # NOTE(review): thread creation is unbounded here although a
            # semaphore (sema) exists at module level — confirm intent.
            for sc in subcat:
                product = sc.find("h2", class_="product-name")
                p_info = product.find("a")
                if p_info['href'] not in products.values():
                    products[p_info['title']] = p_info['href']
                    thread = threading.Thread(target=get_product_details,args=(p_info['href'], config, producer))
                    threads.append(thread)
                    thread.start()
            # Follow pagination within the category before moving to the next.
            next_page = soup.find("a", class_="next i-next")
            if next_page is None:
                url = next_url(process_categories)
            else:
                url = next_page['href'].replace(config["fun"]["url"], "")
    # Wait for all product scrapers, then push the final price snapshot.
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['fun'])