You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

157 lines
5.9 KiB

3 years ago
  1. #!/usr/bin/env python3
  2. import requests
  3. import logging
  4. import xml.etree.ElementTree as ET
  5. import json
  6. import unidecode
  7. import sqlite3
  8. import common
  9. import threading
  10. import re
  11. import traceback
  12. from bs4 import BeautifulSoup
  13. from datetime import datetime
  14. from kafka import KafkaProducer
  15. def get_categories(url, cookie):
  16. soup = common.get_soup_page_no_limit(url, cookie)
  17. categories = {}
  18. dropdown = soup.find('div', class_="mxd-mega-dropdown")
  19. anchors = dropdown.find_all('a')
  20. for anchor in anchors:
  21. if anchor['href'].find('bricosolar') >= 0:
  22. print(f'remove {anchor["href"]}')
  23. pass
  24. elif anchor['href'] not in categories.values():
  25. if anchor['href'].find('https://www.brico.be') == -1:
  26. anchor['href'] = f'https://www.brico.be{anchor["href"]}'
  27. categories[anchor.text] = anchor['href']
  28. return categories
  29. def get_product_details(prod_url, config, kafka_producer, failed):
  30. try:
  31. soup = common.get_soup_page_no_limit(prod_url, config['brico']['cookie'])
  32. prod_detail = soup.find('div', class_='mxd-container mxd-product-detail-layout')
  33. if prod_detail is None:
  34. return
  35. prod_specs = soup.find('div', {'id': 'specs'})
  36. title = prod_detail.find('h1', class_='mxd-product-detail-layout-heading mxd-h1')
  37. product = {}
  38. price = {}
  39. product['title'] = title.find("span", {'itemprop': 'name'}).text
  40. product['url'] = prod_url
  41. product['image_url'] = prod_detail.find("img", class_="mxd-fixed-ratio-image")['src']
  42. product['info'] = prod_specs.find_all("p")[0].text
  43. product['sku_code'] = prod_url.split("/")[-1]
  44. ean_pattern = "([0-9]{8,13})"
  45. if len(prod_specs.find_all("p")) > 1:
  46. ean_match = re.search(ean_pattern, prod_specs.find_all("p")[1].text)
  47. product['ean_code'] = ean_match.group(1)
  48. pattern_brand = 'brands\"\:\[\{\"code\"\:\"[\w\s]+\",\"name\"\:\"([\w\s]+)'
  49. pattern_ean = '\"ean\"\:\"([0-9]{8,13})\"'
  50. scripts = soup.find_all("script")
  51. for script in scripts:
  52. if script.string is not None:
  53. if script.string.find('"brands"') >= 0:
  54. match = re.search(pattern_brand, script.string)
  55. product['brand'] = match.group(1)
  56. if script.string.find('"ean"') >= 0 and 'ean_code' not in product:
  57. match = re.search(pattern_ean, script.string)
  58. product['ean_code'] = match.group(1)
  59. if soup.find("ins") is None:
  60. return
  61. price['price'] = soup.find("ins").find("meta")['content']
  62. price['promo'] = 0 if soup.find("del") is None else 1
  63. common.add_product(kafka_producer, config['brico']['name'], product, price)
  64. except Exception as err:
  65. common.dump_failed_product(config['brico']['name'], prod_url, soup, err, traceback.format_exc())
  66. def next_url(url_list):
  67. if len(url_list) == 0:
  68. return None
  69. key = next(iter(url_list))
  70. url = url_list[key]
  71. del url_list[key]
  72. return url
# Intended cap on concurrent scraper threads.
maxthreads = 10
# NOTE(review): this semaphore is never acquired anywhere in this file --
# presumably meant to enforce maxthreads; confirm before relying on the limit.
sema = threading.Semaphore(value=maxthreads)
# Every spawned worker thread; joined at the end of the main loop.
threads = list()
if __name__ == "__main__" :
    # Bootstrap: load config, connect to Kafka, register the store, and
    # clear previously recorded scrape failures for this store.
    config = common.get_config()
    # NOTE(review): the config key is spelled 'kafka_boostrap_servers' (sic)
    # to match the config file -- do not "fix" the spelling here alone.
    producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
    common.insert_update_store(producer, {'store': config['brico']['name'], 'url': config['brico']['url'], 'image_url': config['brico']['logo']})
    common.clear_failed_product(config['brico']['name'])
    # all_categories remembers every category ever seen (for de-duplication);
    # process_categories is the work queue that next_url() drains in place.
    all_categories = get_categories(config['brico']['url'], config['brico']['cookie'])
    process_categories = all_categories.copy()
    url = next_url(process_categories)
    products = {}     # product name -> href, de-duplicates product pages
    failed_prod = {}  # handed to workers but never populated there
    i = 0             # pages processed, for the progress print only
    while url is not None:
        i = i + 1
        print(f'{i}/{len(all_categories)} - {len(process_categories)}')
        soup = common.get_soup_page_no_limit(url, config['brico']['cookie'])
        page_products = soup.find_all("div", class_="mxd-block-card mxd-block-card-responsive")
        # Listing pages can link to deeper sub-categories; queue unseen ones,
        # excluding the 'bricosolar' section.
        extra_cat = soup.find_all("a", class_="mxd-product-list-link")
        if len(extra_cat) > 0:
            for cat in extra_cat:
                if cat['href'].find('bricosolar') >= 0:
                    print(f'main remove {cat["href"]}')
                    pass
                elif cat['href'] not in all_categories.values():
                    cat_name = cat.find("span").text
                    # make the href root-relative before prefixing the site URL
                    if cat['href'].find("/") > 0:
                        cat['href'] = f'/{cat["href"]}'
                    process_categories[cat_name] = f'{config["brico"]["url"]}{cat["href"]}'
                    #print(f'added {config["brico"]["url"]}{cat["href"]}')
                    all_categories[cat_name] = cat['href']
        # One worker thread per not-yet-seen product card on this page.
        for product in page_products:
            product = product.find("a", class_="mxd-block-card-link")
            # Normalise to a root-relative Dutch-language ('/nl') path.
            if product['href'].find("/nl") == -1:
                product['href'] = f'/nl{product["href"]}'
            if product['href'].find("/") > 0:
                product['href'] = f'/{product["href"]}'
            if product['href'] not in products.values():
                prod_name = product.find("span", {"itemprop": "name"}).text
                #print(f'product url: {product["href"]}')
                products[prod_name] = product["href"]
                # NOTE(review): thread creation is unbounded -- the module-level
                # semaphore 'sema' is never acquired; confirm the intended cap.
                thread = threading.Thread(target=get_product_details,args=(f'{config["brico"]["url"]}{product["href"]}', config, producer, failed_prod))
                threads.append(thread)
                thread.start()
        # Follow pagination when present, else move to the next queued category.
        next_page = soup.find("a", {"rel": "next"})
        if next_page is None:
            url = next_url(process_categories)
        else:
            # pagination hrefs are relative to the nl-be alternate base URL
            base = soup.find("link", {"rel": "alternate", "hreflang": "nl-be"})
            url = f'{base["href"]}{next_page["href"]}'
            print(f'next page: {url}')
    # Wait for every worker before publishing the final price update.
    for t in threads:
        t.join()
    common.update_store_prices(producer, config['brico'])