import yaml
import json
import os
import shutil
import traceback
import requests
import re

from bs4 import BeautifulSoup
from ratelimit import limits, sleep_and_retry
from time import sleep
from random import randint
from datetime import datetime


def get_config():
    with open('cheap_shopper.yaml', 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

    return cfg


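# A hypothetical minimal layout for cheap_shopper.yaml, inferred only from the keys
# this module reads elsewhere (config['name'], config['url'], config['logo']); the
# real file may carry additional per-store scraping settings.
#
#   name: Example Store
#   url: https://www.example-store.com
#   logo: https://www.example-store.com/logo.png

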
def add_product(kafka_producer, store, product, price):
    db_object = {}
    db_object['type'] = 'product'
    db_object['store'] = store
    db_object['product'] = product
    db_object['price'] = price

    db_object_json = json.dumps(db_object)
    db_object_bytes = bytearray(db_object_json, 'utf-8')

    send_kafka_message(kafka_producer, db_object_bytes)


def insert_update_store(kafka_producer, store):
    db_object = {}
    db_object['type'] = 'store'
    db_object['store'] = store

    db_object_json = json.dumps(db_object)
    db_object_bytes = bytearray(db_object_json, 'utf-8')

    send_kafka_message(kafka_producer, db_object_bytes)


def send_kafka_message(kafka_producer, message):
    kafka_producer.send('shopper_db', message)


def update_store_prices(kafka_producer, config):
    # Refresh the store record, then emit a 'store_update' marker for this store.
    insert_update_store(kafka_producer, {'store': config['name'],
                                         'url': config['url'],
                                         'image_url': config['logo'],
                                         'last_update': datetime.now().strftime('%d/%m/%Y')})

    db_object = {}
    db_object['type'] = 'store_update'
    db_object['store'] = config['name']

    db_object_json = json.dumps(db_object)
    db_object_bytes = bytearray(db_object_json, 'utf-8')
    send_kafka_message(kafka_producer, db_object_bytes)
    print(f'updating prices {db_object_json}')


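# A minimal end-to-end sketch of how these Kafka helpers are expected to be wired
# together. Assumptions not confirmed by this module: the kafka-python client is
# used, a broker is reachable at localhost:9092, and the product values are
# illustrative only.
#
#   from kafka import KafkaProducer
#
#   producer = KafkaProducer(bootstrap_servers='localhost:9092')
#   config = get_config()
#   add_product(producer, config['name'], 'Example Widget', 9.99)
#   update_store_prices(producer, config)
#   producer.flush()

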
def clear_failed_product(store):
    # Remove any previously dumped failure reports for this store.
    if not os.path.exists('failed'):
        os.mkdir('failed')
    if not os.path.exists(f'failed/{store}'):
        os.mkdir(f'failed/{store}')
        return

    for entry in os.listdir(f'failed/{store}/'):
        shutil.rmtree(f'failed/{store}/{entry}')


def dump_failed_product(store, prod_url, page, err, trace):
    # Persist the failing URL, the error/traceback and the raw page so the parse
    # failure can be reproduced later.
    dirname = prod_url.replace('https://', '')
    dirname = dirname.replace('.', '-')
    dirname = dirname.replace('/', '_')
    # makedirs creates the 'failed/<store>' hierarchy and exist_ok avoids crashing
    # when the same product fails more than once.
    os.makedirs(f'failed/{store}/{dirname}', exist_ok=True)

    with open(f'failed/{store}/{dirname}/error.txt', "w") as err_file:
        err_file.write(f'{prod_url}\n')
        err_file.write('===========================================\n')
        err_file.write(f'{str(err)}\n')
        err_file.write('===========================================\n')
        err_file.write(str(trace))

    with open(f'failed/{store}/{dirname}/page.html', "w", encoding="utf-8") as page_file:
        page_file.write(str(page))


def get_proxies():
    # Scrape a list of 'ip:port' strings from free-proxy-list.net.
    page = requests.get("https://free-proxy-list.net/")
    soup = BeautifulSoup(page.content, "html.parser")
    raw_div = soup.find("div", {"id": "raw"})
    raw_textarea = raw_div.find("textarea")
    ip_list = re.findall(r"([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+)", raw_textarea.text)
    return ip_list


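# A sketch of how the commented-out proxy rotation in get_soup_page could be driven
# by get_proxies(). itertools.cycle and the extra import are assumptions, not part
# of this module.
#
#   from itertools import cycle
#
#   proxy_pool = cycle(get_proxies())
#   page = requests.get(url, proxies={"http": next(proxy_pool), "https": next(proxy_pool)})

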
@sleep_and_retry
@limits(calls=2, period=1)
def get_soup_page(url, cookie=None):
    #print(f'get page for soup: {url}')
    # Rate-limited fetch: at most 2 calls per second, plus a small random delay.
    sleep(randint(1, 2))
    soup = None
    try:
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
                   'content-type': 'text/html;charset=UTF-8'}
        #page = requests.get(url, proxies={"http": next(proxy_pool), "https": next(proxy_pool)})
        page = requests.get(url, cookies=cookie, headers=headers)
        if page.status_code != 200 and page.status_code != 301:
            return None
        soup = BeautifulSoup(page.content, "html.parser")
    except ConnectionRefusedError:
        # Back off briefly and retry once, keeping the original cookie.
        print(traceback.format_exc())
        sleep(randint(2, 3))
        soup = get_soup_page(url, cookie)
    except Exception as err:
        print(traceback.format_exc())
        print(err)

    return soup


def get_soup_page_no_limit(url, cookie=None, payload=None, headers=None):
    #print(f'get page for soup: {url}')
    sleep(randint(1, 2))
    soup = None
    try:
        # Avoid a mutable default argument; always send a browser-like user-agent.
        if not headers:
            headers = {'content-type': 'text/html;charset=UTF-8'}
        headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51'
        #page = requests.get(url, proxies={"http": next(proxy_pool), "https": next(proxy_pool)})
        page = requests.get(url, cookies=cookie, data=payload, headers=headers)
        #print(page.content)
        if page.status_code != 200 and page.status_code != 301:
            return None
        soup = BeautifulSoup(page.content, "html.parser")
    except Exception as err:
        print(traceback.format_exc())
        print(err)
    return soup
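

# A sketch of the failure-handling pattern the helpers above are built around:
# fetch a product page, push the parsed price to Kafka, and dump anything that
# fails so it can be inspected offline. parse_price(), the URL list, the store
# name and the producer (from the earlier sketch) are illustrative assumptions,
# not part of this module.
#
#   clear_failed_product('example-store')
#   for prod_url in ['https://www.example-store.com/product/123']:
#       soup = get_soup_page(prod_url)
#       try:
#           price = parse_price(soup)  # hypothetical store-specific parser
#           add_product(producer, 'example-store', 'Example Widget', price)
#       except Exception as err:
#           dump_failed_product('example-store', prod_url, soup, err, traceback.format_exc())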