import yaml
import json
import os
import shutil
import traceback
import requests
import re

from bs4 import BeautifulSoup
from ratelimit import limits, sleep_and_retry
from time import sleep
from random import randint
from datetime import datetime


def get_config():
    with open('cheap_shopper.yaml', 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

    return cfg

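# The loader above doesn't enforce a schema. Judging only from the keys read later in this
# module (config['name'], config['url'], config['logo'] in update_store_prices), a minimal
# cheap_shopper.yaml might look like the sketch below; the values, and any extra fields a
# real deployment needs, are assumptions rather than part of this code.
#
#   name: Example Store
#   url: https://www.example-store.com
#   logo: https://www.example-store.com/logo.png

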
def add_product(kafka_producer, store, product, price):
    # Publish one scraped product/price record.
    db_object = {'type': 'product', 'store': store, 'product': product, 'price': price}
    send_kafka_message(kafka_producer, json.dumps(db_object).encode('utf-8'))


def insert_update_store(kafka_producer, store):
    # Publish (insert or update) a store metadata record.
    db_object = {'type': 'store', 'store': store}
    send_kafka_message(kafka_producer, json.dumps(db_object).encode('utf-8'))


def send_kafka_message(kafka_producer, message):
    kafka_producer.send('shopper_db', message)


def update_store_prices(kafka_producer, config):
    # Upsert the store record, then emit a 'store_update' marker for this store.
    insert_update_store(kafka_producer, {'store': config['name'],
                                         'url': config['url'],
                                         'image_url': config['logo'],
                                         'last_update': datetime.now().strftime('%d/%m/%Y')})

    db_object = {'type': 'store_update', 'store': config['name']}
    db_object_json = json.dumps(db_object)
    send_kafka_message(kafka_producer, db_object_json.encode('utf-8'))
    print(f'updating prices {db_object_json}')


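# A minimal usage sketch for the Kafka helpers above (not called anywhere in this module).
# It assumes the kafka-python package, a broker at localhost:9092, and the config layout
# sketched near get_config; the 'shopper_db' topic and helper signatures come from this file.
def _example_publish_store_prices():
    from kafka import KafkaProducer

    producer = KafkaProducer(bootstrap_servers='localhost:9092')
    cfg = get_config()
    update_store_prices(producer, cfg)
    add_product(producer, cfg['name'], 'Example Widget 500g', '9.99')
    producer.flush()

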
def clear_failed_product(store):
    # Ensure the per-store failure directory exists; if it already did, clear out old dumps.
    if not os.path.exists(f'failed/{store}'):
        os.makedirs(f'failed/{store}')
        return
    for dirname in os.listdir(f'failed/{store}/'):
        shutil.rmtree(f'failed/{store}/{dirname}')


def dump_failed_product(store, prod_url, page, err, trace):
    # Save the offending page and the exception details so the parser can be debugged offline.
    dirname = prod_url.replace('https://', '').replace('.', '-').replace('/', '_')
    os.makedirs(f'failed/{store}/{dirname}', exist_ok=True)
    with open(f'failed/{store}/{dirname}/error.txt', 'w', encoding='utf-8') as err_file:
        err_file.write(f'{prod_url}\n')
        err_file.write('===========================================\n')
        err_file.write(f'{err}\n')
        err_file.write('===========================================\n')
        err_file.write(str(trace))
    with open(f'failed/{store}/{dirname}/page.html', 'w', encoding='utf-8') as page_file:
        page_file.write(str(page))


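# Sketch of the failure-dump pattern the helpers above support (the call site itself is an
# assumption): wrap the store-specific parsing code and persist the page plus traceback
# whenever it raises, so the selector can later be fixed against the saved HTML.
def _example_parse_with_dump(store_name, prod_url, page, parse_product):
    try:
        return parse_product(page)
    except Exception as err:
        dump_failed_product(store_name, prod_url, page, err, traceback.format_exc())
        return None

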
def get_proxies():
    # Scrape a fresh list of 'ip:port' proxies from free-proxy-list.net.
    page = requests.get("https://free-proxy-list.net/")
    soup = BeautifulSoup(page.content, "html.parser")
    raw_div = soup.find("div", {"id": "raw"})
    raw_textarea = raw_div.find("textarea")
    ip_list = re.findall(r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+", raw_textarea.text)
    return ip_list


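# Sketch of how get_proxies() could feed the commented-out proxy rotation in get_soup_page
# below; the proxy_pool name is taken from that comment, everything else is an assumption
# (requests expects scheme-prefixed proxy URLs, so the scheme is added here).
def _example_proxy_pool(url):
    from itertools import cycle

    proxy_pool = cycle(get_proxies())
    proxy = f'http://{next(proxy_pool)}'
    return requests.get(url, proxies={'http': proxy, 'https': proxy})

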
@sleep_and_retry
@limits(calls=2, period=1)
def get_soup_page(url, cookie=None):
    # Rate-limited fetch: at most two requests per second, plus a small random delay.
    sleep(randint(1, 2))
    soup = None
    try:
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
                   'content-type': 'text/html;charset=UTF-8'}
        #page = requests.get(url, proxies={"http": next(proxy_pool), "https": next(proxy_pool)})
        page = requests.get(url, cookies=cookie, headers=headers)
        if page.status_code not in (200, 301):
            return None
        soup = BeautifulSoup(page.content, "html.parser")
    except requests.exceptions.ConnectionError:
        # requests wraps refused/reset connections in ConnectionError; back off and retry.
        print(traceback.format_exc())
        sleep(randint(2, 3))
        soup = get_soup_page(url, cookie)
    except Exception as err:
        print(traceback.format_exc())
        print(err)

    return soup


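# End-to-end sketch tying the fetch, failure-dump and Kafka helpers together. Everything
# store-specific here - the URL list, the CSS selectors and the price cleanup - is
# hypothetical; only the helper functions themselves come from this module.
def _example_scrape_store(kafka_producer, store_name, product_urls):
    clear_failed_product(store_name)
    for prod_url in product_urls:
        page = get_soup_page(prod_url)
        if page is None:
            continue
        try:
            # Hypothetical selectors: each real store needs its own parsing logic.
            product = page.select_one('h1.product-title').get_text(strip=True)
            price = page.select_one('span.price').get_text(strip=True)
            add_product(kafka_producer, store_name, product, price)
        except Exception as err:
            dump_failed_product(store_name, prod_url, page, err, traceback.format_exc())

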
def get_soup_page_no_limit(url, cookie=None, payload=None, headers=None):
    # Unthrottled variant of get_soup_page that also accepts a request body and caller-supplied headers.
    sleep(randint(1, 2))
    soup = None
    try:
        if not headers:
            headers = {'content-type': 'text/html;charset=UTF-8'}
        # Only fall back to a default user agent when the caller has not supplied one.
        headers.setdefault('user-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51')
        #page = requests.get(url, proxies={"http": next(proxy_pool), "https": next(proxy_pool)})
        page = requests.get(url, cookies=cookie, data=payload, headers=headers)
        if page.status_code not in (200, 301):
            return None
        soup = BeautifulSoup(page.content, "html.parser")
    except Exception as err:
        print(traceback.format_exc())
        print(err)
    return soup
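

# Sketch of the no-limit variant with caller-supplied headers; the header values are
# illustrative, and the function simply forwards them (plus an optional form payload)
# to requests.
def _example_fetch_with_language_header(url):
    headers = {'user-agent': 'Mozilla/5.0', 'accept-language': 'en-GB,en;q=0.9'}
    return get_soup_page_no_limit(url, headers=headers)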