import json
import os
import re
import shutil
import traceback
from datetime import datetime
from random import randint
from time import sleep

import requests
import yaml
from bs4 import BeautifulSoup
from ratelimit import limits, sleep_and_retry


def get_config():
    """Load the scraper configuration from cheap_shopper.yaml."""
    with open('cheap_shopper.yaml', 'r') as ymlfile:
        return yaml.load(ymlfile, Loader=yaml.FullLoader)


def add_product(kafka_producer, store, product, price):
    """Publish a single product/price record to the shopper_db topic."""
    db_object = {
        'type': 'product',
        'store': store,
        'product': product,
        'price': price,
    }
    db_object_json = json.dumps(db_object)
    send_kafka_message(kafka_producer, bytearray(db_object_json, 'utf-8'))


def insert_update_store(kafka_producer, store):
    """Publish a store insert/update record to the shopper_db topic."""
    db_object = {
        'type': 'store',
        'store': store,
    }
    db_object_json = json.dumps(db_object)
    send_kafka_message(kafka_producer, bytearray(db_object_json, 'utf-8'))


def send_kafka_message(kafka_producer, message):
    """Send a raw message on the shopper_db Kafka topic."""
    kafka_producer.send('shopper_db', message)


def update_store_prices(kafka_producer, config):
    """Refresh the store record and signal that a price update run has started."""
    insert_update_store(kafka_producer, {
        'store': config['name'],
        'url': config['url'],
        'image_url': config['logo'],
        'last_update': datetime.now().strftime('%d/%m/%Y'),
    })
    db_object = {
        'type': 'store_update',
        'store': config['name'],
    }
    db_object_json = json.dumps(db_object)
    send_kafka_message(kafka_producer, bytearray(db_object_json, 'utf-8'))
    print(f'updating prices {db_object_json}')


def clear_failed_product(store):
    """Remove any previously dumped failure reports for the given store."""
    if not os.path.exists('failed'):
        os.mkdir('failed')
    if not os.path.exists(f'failed/{store}'):
        os.mkdir(f'failed/{store}')
        return
    for entry in os.listdir(f'failed/{store}/'):
        shutil.rmtree(f'failed/{store}/{entry}')


def dump_failed_product(store, prod_url, page, err, trace):
    """Save the failing product URL, error, traceback and raw page for later inspection."""
    dirname = prod_url.replace('https://', '')
    dirname = dirname.replace('.', '-')
    dirname = dirname.replace('/', '_')
    os.makedirs(f'failed/{store}/{dirname}', exist_ok=True)
    with open(f'failed/{store}/{dirname}/error.txt', 'w') as err_file:
        err_file.write(f'{prod_url}\n')
        err_file.write('===========================================\n')
        err_file.write(f'{str(err)}\n')
        err_file.write('===========================================\n')
        err_file.write(str(trace))
    with open(f'failed/{store}/{dirname}/page.html', 'w', encoding='utf-8') as page_file:
        page_file.write(str(page))


def get_proxies():
    """Scrape free-proxy-list.net and return the raw list of 'ip:port' entries."""
    page = requests.get('https://free-proxy-list.net/')
    soup = BeautifulSoup(page.content, 'html.parser')
    raw_div = soup.find('div', {'id': 'raw'})
    raw_textarea = raw_div.find('textarea')
    ip_list = re.findall(r'([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+)', raw_textarea.text)
    # Address -> port mapping, built alongside the raw list (only the list is returned).
    ip_dict = {}
    for ip in ip_list:
        ip_addr, ip_port = ip.split(':')
        ip_dict[ip_addr] = ip_port
    return ip_list


@sleep_and_retry
@limits(calls=2, period=1)
def get_soup_page(url, cookie=None):
    """Fetch a page (rate limited to 2 calls per second) and return it as BeautifulSoup."""
    #print(f'get page for soup: {url}')
    sleep(randint(1, 2))
    soup = None
    try:
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
            'content-type': 'text/html;charset=UTF-8',
        }
        #page = requests.get(url, proxies={"http": next(proxy_pool), "https": next(proxy_pool)})
        page = requests.get(url, cookies=cookie, headers=headers)
        if page.status_code not in (200, 301):
            return None
        soup = BeautifulSoup(page.content, "html.parser")
    except (ConnectionRefusedError, requests.exceptions.ConnectionError):
        # Back off briefly and retry, keeping the original cookie.
        print(traceback.format_exc())
        sleep(randint(2, 3))
        soup = get_soup_page(url, cookie)
    except Exception as err:
        print(traceback.format_exc())
        print(err)
    return soup


def get_soup_page_no_limit(url, cookie=None, payload=None, headers=None):
    """Fetch a page without rate limiting and return it as BeautifulSoup."""
    #print(f'get page for soup: {url}')
    sleep(randint(1, 2))
    soup = None
    try:
        if not headers:
            headers = {'content-type': 'text/html;charset=UTF-8'}
        # Supply a desktop browser user-agent unless the caller already provided one.
        headers.setdefault('user-agent',
                           'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                           '(KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51')
        #page = requests.get(url, proxies={"http": next(proxy_pool), "https": next(proxy_pool)})
        page = requests.get(url, cookies=cookie, data=payload, headers=headers)
        #print(page.content)
        if page.status_code not in (200, 301):
            return None
        soup = BeautifulSoup(page.content, "html.parser")
    except Exception as err:
        print(traceback.format_exc())
        print(err)
    return soup