You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

95 lines
3.1 KiB

3 years ago
  1. #!/usr/bin/env python3
  2. import requests
  3. import logging
  4. import xml.etree.ElementTree as ET
  5. import json
  6. import unidecode
  7. import sqlite3
  8. import common
  9. import threading
  10. import re
  11. import traceback
  12. import sys
  13. from bs4 import BeautifulSoup
  14. from datetime import datetime
  15. from kafka import KafkaProducer
  16. def get_product_details(prod_url, config, kafka_producer, json_product):
  17. try:
  18. soup = common.get_soup_page_no_limit(prod_url)
  19. description = soup.find('div', class_='pdp-description__content')
  20. ean_row = soup.find_all('div', class_='row border-bottom')
  21. product = {}
  22. price = {}
  23. product['title'] = json_product['title']
  24. product['url'] = prod_url
  25. if 'image' in json_product:
  26. product['image_url'] = json_product['image']
  27. if description.find("p") is not None:
  28. product['info'] = description.find("p").text
  29. product['sku_code'] = json_product['sku']
  30. for row in ean_row:
  31. if row.text.find('EAN- / barcode') > -1:
  32. ean_pattern = "([0-9]{8,13})"
  33. ean_match = re.search(ean_pattern, row.text)
  34. product['ean_code'] = ean_match.group(1)
  35. if 'brand' in json_product:
  36. product['brand'] = json_product['brand']
  37. price['price'] = json_product['price']
  38. if 'discount' in json_product:
  39. price['promo'] = 1
  40. else:
  41. price['promo'] = 0
  42. common.add_product(kafka_producer, config['hubo']['name'], product, price)
  43. except Exception as err:
  44. common.dump_failed_product(config['hubo']['name'], prod_url, soup, err, traceback.format_exc())
  45. def next_url(url_list):
  46. if len(url_list) == 0:
  47. return None
  48. key = next(iter(url_list))
  49. url = url_list[key]
  50. del url_list[key]
  51. return url
  52. maxthreads = 5
  53. sema = threading.Semaphore(value=maxthreads)
  54. threads = list()
  55. if __name__ == "__main__" :
  56. config = common.get_config()
  57. producer = KafkaProducer(bootstrap_servers=[config['config']['kafka_boostrap_servers']])
  58. common.insert_update_store(producer, {'store': config['hubo']['name'], 'url': config['hubo']['url'], 'image_url': config['hubo']['logo']})
  59. common.clear_failed_product(config['hubo']['name'])
  60. get_doc = requests.get(config['hubo']['products_url'])
  61. data_json = json.loads(unidecode.unidecode(get_doc.content.decode('utf-8', 'ignore')))
  62. offset = 1
  63. while len(data_json['docs']) > 0:
  64. print(f'{offset}/{data_json["doc_count"]}')
  65. for product in data_json['docs']:
  66. thread = threading.Thread(target=get_product_details,args=(f'{config["hubo"]["url"]}{product["url"]}', config, producer, product))
  67. threads.append(thread)
  68. thread.start()
  69. offset = offset + int(data_json['limit'])
  70. get_doc = requests.get(config['hubo']['products_url_offset'].replace('__OFFSET__', str(offset)))
  71. data_json = json.loads(unidecode.unidecode(get_doc.content.decode('utf-8', 'ignore')))
  72. for t in threads:
  73. t.join()
  74. common.update_store_prices(producer, config['hubo'])