import yaml
import json
import os
import shutil
import traceback
import requests
import re
from bs4 import BeautifulSoup
from ratelimit import limits, sleep_and_retry
from time import sleep
from random import randint
from datetime import datetime
def get_config():
    # Load the scraper configuration from cheap_shopper.yaml in the working directory.
    with open('cheap_shopper.yaml', 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
    return cfg
def add_product(kafka_producer, store, product, price):
    # Publish a single product/price record for a store to the database topic.
    db_object = {
        'type': 'product',
        'store': store,
        'product': product,
        'price': price,
    }
    db_object_json = json.dumps(db_object)
    db_object_bytes = bytearray(db_object_json, 'utf-8')
    send_kafka_message(kafka_producer, db_object_bytes)
def insert_update_store(kafka_producer, store):
    # Publish a store record (insert or update) to the database topic.
    db_object = {
        'type': 'store',
        'store': store,
    }
    db_object_json = json.dumps(db_object)
    db_object_bytes = bytearray(db_object_json, 'utf-8')
    send_kafka_message(kafka_producer, db_object_bytes)
def send_kafka_message(kafka_producer, message):
    # All database updates go through the single 'shopper_db' topic.
    kafka_producer.send('shopper_db', message)
def update_store_prices(kafka_producer, config):
    # Refresh the store record, then emit a 'store_update' marker message for this store.
    insert_update_store(kafka_producer, {'store': config['name'],
                                         'url': config['url'],
                                         'image_url': config['logo'],
                                         'last_update': datetime.now().strftime('%d/%m/%Y')})
    db_object = {
        'type': 'store_update',
        'store': config['name'],
    }
    db_object_json = json.dumps(db_object)
    db_object_bytes = bytearray(db_object_json, 'utf-8')
    send_kafka_message(kafka_producer, db_object_bytes)
    print(f'updating prices {db_object_json}')
def clear_failed_product(store):
    # Reset the failure dump directory for a store, creating it on first use.
    if not os.path.exists('failed'):
        os.mkdir('failed')
    if not os.path.exists(f'failed/{store}'):
        os.mkdir(f'failed/{store}')
        return
    for entry in os.listdir(f'failed/{store}/'):
        shutil.rmtree(f'failed/{store}/{entry}')
def dump_failed_product(store, prod_url, page, err, trace):
    # Persist the failing URL, exception, traceback and raw page HTML so the
    # parse failure can be reproduced later.
    if not os.path.exists('failed'):
        os.mkdir('failed')
    if not os.path.exists(f'failed/{store}'):
        os.mkdir(f'failed/{store}')
    dirname = prod_url.replace('https://', '')
    dirname = dirname.replace('.', '-')
    dirname = dirname.replace('/', '_')
    os.makedirs(f'failed/{store}/{dirname}', exist_ok=True)
    with open(f'failed/{store}/{dirname}/error.txt', 'w') as err_file:
        err_file.write(f'{prod_url}\n')
        err_file.write('===========================================\n')
        err_file.write(f'{str(err)}\n')
        err_file.write('===========================================\n')
        err_file.write(str(trace))
    with open(f'failed/{store}/{dirname}/page.html', 'w', encoding='utf-8') as page_file:
        page_file.write(str(page))
def get_proxies():
    # Scrape the raw "ip:port" proxy list from free-proxy-list.net.
    page = requests.get("https://free-proxy-list.net/")
    soup = BeautifulSoup(page.content, "html.parser")
    raw_div = soup.find("div", {"id": "raw"})
    raw_textarea = raw_div.find("textarea")
    ip_list = re.findall(r"([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+)", raw_textarea.text)
    # ip_dict maps address -> port; callers currently receive the raw "ip:port" list.
    ip_dict = {}
    for ip in ip_list:
        ip_addr = ip.split(":")[0]
        ip_port = ip.split(":")[1]
        ip_dict[ip_addr] = ip_port
    return ip_list
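
# Illustrative sketch, not part of the original module: one possible way to turn
# the "ip:port" strings from get_proxies() into the proxy_pool iterator used in
# the commented-out requests.get(..., proxies=...) calls below. The helper name
# and the round-robin cycling are assumptions, not the author's confirmed design.
def make_proxy_pool():
    from itertools import cycle
    # Round-robin iterator of proxy URLs, e.g. next(pool) -> 'http://1.2.3.4:8080'.
    return cycle(f'http://{entry}' for entry in get_proxies())
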
@sleep_and_retry
@limits(calls=2, period=1)
def get_soup_page(url, cookie=None):
    # Rate-limited page fetch: at most 2 calls per second plus a random pause,
    # returning a parsed BeautifulSoup document or None on failure.
    #print(f'get page for soup: {url}')
    sleep(randint(1, 2))
    soup = None
    try:
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
                   'content-type': 'text/html;charset=UTF-8'}
        #page = requests.get(url, proxies={"http": next(proxy_pool), "https": next(proxy_pool)})
        page = requests.get(url, cookies=cookie, headers=headers)
        if page.status_code != 200 and page.status_code != 301:
            return None
        soup = BeautifulSoup(page.content, "html.parser")
    except requests.exceptions.ConnectionError:
        # Back off briefly and retry once, keeping the original cookie.
        print(traceback.format_exc())
        sleep(randint(2, 3))
        soup = get_soup_page(url, cookie)
    except Exception as err:
        print(traceback.format_exc())
        print(err)
    return soup
def get_soup_page_no_limit(url, cookie=None, payload=None, headers=None):
    # Unthrottled variant of get_soup_page; callers may supply their own headers
    # and an optional request payload.
    #print(f'get page for soup: {url}')
    sleep(randint(1, 2))
    soup = None
    try:
        if not headers:
            headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
                       'content-type': 'text/html;charset=UTF-8'}
        if 'user-agent' not in headers:
            # Fall back to a desktop browser user agent when the caller did not set one.
            headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51'
        #page = requests.get(url, proxies={"http": next(proxy_pool), "https": next(proxy_pool)})
        page = requests.get(url, cookies=cookie, data=payload, headers=headers)
        #print(page.content)
        if page.status_code != 200 and page.status_code != 301:
            return None
        soup = BeautifulSoup(page.content, "html.parser")
    except Exception as err:
        print(traceback.format_exc())
        print(err)
    return soup
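
# Usage sketch, not part of the original module: a minimal smoke test assuming
# the producer is kafka-python's KafkaProducer (its .send(topic, value) call
# matches send_kafka_message above). The broker address, the example store
# values, and the product payload shape are assumptions, not confirmed usage.
if __name__ == '__main__':
    from kafka import KafkaProducer

    producer = KafkaProducer(bootstrap_servers='localhost:9092')  # broker address is an assumption
    cfg = get_config()  # requires cheap_shopper.yaml in the working directory
    store_cfg = {'name': 'Example Store',
                 'url': 'https://example.com',
                 'logo': 'https://example.com/logo.png'}
    update_store_prices(producer, store_cfg)
    add_product(producer, store_cfg['name'], 'Sample product', 9.99)
    producer.flush()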