"""Shared scraping helpers for the cheap_shopper price-comparison jobs.

Provides config loading, Kafka message helpers, failed-product dumps and
rate-limited page fetching with BeautifulSoup.
"""
import json
import os
import re
import shutil
import traceback
from datetime import datetime
from random import randint
from time import sleep

import requests
import yaml
from bs4 import BeautifulSoup
from ratelimit import limits, sleep_and_retry


def get_config():
    """Load the scraper configuration from cheap_shopper.yaml."""
    with open('cheap_shopper.yaml', 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
    return cfg


def add_product(kafka_producer, store, product, price):
    db_object = {
        'type': 'product',
        'store': store,
        'product': product,
        'price': price,
    }
    db_object_json = json.dumps(db_object)
    db_object_bytes = bytearray(db_object_json, 'utf-8')
    send_kafka_message(kafka_producer, db_object_bytes)


def insert_update_store(kafka_producer, store):
    db_object = {
        'type': 'store',
        'store': store,
    }
    db_object_json = json.dumps(db_object)
    db_object_bytes = bytearray(db_object_json, 'utf-8')
    send_kafka_message(kafka_producer, db_object_bytes)


def send_kafka_message(kafka_producer, message):
    kafka_producer.send('shopper_db', message)


def update_store_prices(kafka_producer, config):
    # Refresh the store record, then emit a 'store_update' marker message.
    insert_update_store(kafka_producer, {'store': config['name'], 'url': config['url'],
                                         'image_url': config['logo'],
                                         'last_update': datetime.now().strftime('%d/%m/%Y')})
    db_object = {'type': 'store_update', 'store': config['name']}
    db_object_json = json.dumps(db_object)
    db_object_bytes = bytearray(db_object_json, 'utf-8')
    send_kafka_message(kafka_producer, db_object_bytes)
    print(f'updating prices {db_object_json}')


def clear_failed_product(store):
    """Remove any previously dumped failure reports for this store."""
    if not os.path.exists('failed'):
        os.mkdir('failed')
    if not os.path.exists(f'failed/{store}'):
        os.mkdir(f'failed/{store}')
        return
    for entry in os.listdir(f'failed/{store}/'):
        shutil.rmtree(f'failed/{store}/{entry}')


def dump_failed_product(store, prod_url, page, err, trace):
    """Write the failing product URL, error, traceback and raw page to disk."""
    if not os.path.exists('failed'):
        os.mkdir('failed')
    if not os.path.exists(f'failed/{store}'):
        os.mkdir(f'failed/{store}')
    # Build a filesystem-safe directory name from the product URL.
    dirname = prod_url.replace('https://', '')
    dirname = dirname.replace('.', '-')
    dirname = dirname.replace('/', '_')
    os.mkdir(f'failed/{store}/{dirname}')
    with open(f'failed/{store}/{dirname}/error.txt', 'w') as err_file:
        err_file.write(f'{prod_url}\n')
        err_file.write('===========================================\n')
        err_file.write(f'{str(err)}\n')
        err_file.write('===========================================\n')
        err_file.write(str(trace))
    with open(f'failed/{store}/{dirname}/page.html', 'w', encoding='utf-8') as page_file:
        page_file.write(str(page))


def get_proxies():
    """Scrape free-proxy-list.net and return a dict mapping proxy IP -> port."""
    page = requests.get("https://free-proxy-list.net/")
    soup = BeautifulSoup(page.content, "html.parser")
    raw_div = soup.find("div", {"id": "raw"})
    raw_textarea = raw_div.find("textarea")
    # Each entry in the textarea has the form "1.2.3.4:8080".
    ip_list = re.findall(r"([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+)", raw_textarea.text)
    ip_dict = {}
    for ip in ip_list:
        ip_addr, ip_port = ip.split(":")
        ip_dict[ip_addr] = ip_port
    return ip_dict
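

# Sketch of how the scraped proxies could feed the commented-out proxy_pool
# usage in get_soup_page below. get_proxy_pool is a hypothetical helper added
# here for illustration only; it is not part of the original module.
def get_proxy_pool():
    from itertools import cycle  # local import keeps the sketch self-contained
    proxies = get_proxies()
    # Round-robin over "ip:port" strings, e.g. proxies={"http": next(proxy_pool)}.
    return cycle([f'{ip}:{port}' for ip, port in proxies.items()])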


@sleep_and_retry
@limits(calls=2, period=1)
def get_soup_page(url, cookie=None):
    """Fetch a URL (rate limited to 2 calls per second) and return a BeautifulSoup tree."""
    sleep(randint(1, 2))
    soup = None
    try:
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
                   'content-type': 'text/html;charset=UTF-8'}
        #page = requests.get(url, proxies={"http": next(proxy_pool), "https": next(proxy_pool)})
        page = requests.get(url, cookies=cookie, headers=headers)
        if page.status_code != 200 and page.status_code != 301:
            return None
        soup = BeautifulSoup(page.content, "html.parser")
    except requests.exceptions.ConnectionError:
        # requests wraps refused connections in ConnectionError; back off briefly and retry.
        print(traceback.format_exc())
        sleep(randint(2, 3))
        soup = get_soup_page(url, cookie)
    except Exception as err:
        print(traceback.format_exc())
        print(err)
    return soup


def get_soup_page_no_limit(url, cookie=None, payload=None, headers=None):
    """Fetch a URL without rate limiting; caller-supplied headers take precedence."""
    sleep(randint(1, 2))
    soup = None
    try:
        if not headers:
            headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51',
                       'content-type': 'text/html;charset=UTF-8'}
        # Only add a default user-agent when the caller did not supply one.
        headers.setdefault('user-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51')
        #page = requests.get(url, proxies={"http": next(proxy_pool), "https": next(proxy_pool)})
        page = requests.get(url, cookies=cookie, data=payload, headers=headers)
        if page.status_code != 200 and page.status_code != 301:
            return None
        soup = BeautifulSoup(page.content, "html.parser")
    except Exception as err:
        print(traceback.format_exc())
        print(err)
    return soup
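

# --- Usage sketch (illustration only, not part of the original module) ---
# A minimal example of how these helpers fit together. It assumes the
# kafka-python package (KafkaProducer) is installed and that cheap_shopper.yaml
# exposes a 'kafka_host' key plus per-store 'name', 'url' and 'logo' values;
# the exact config layout is an assumption.
if __name__ == '__main__':
    from kafka import KafkaProducer  # assumed dependency (kafka-python)

    cfg = get_config()
    producer = KafkaProducer(bootstrap_servers=cfg.get('kafka_host', 'localhost:9092'))

    store_cfg = {'name': 'example-store', 'url': 'https://example.com',
                 'logo': 'https://example.com/logo.png'}
    update_store_prices(producer, store_cfg)

    soup = get_soup_page('https://example.com/product/123')
    if soup is not None:
        # A real scraper would parse the product name and price out of `soup`;
        # the values below are placeholders.
        add_product(producer, store_cfg['name'], 'Example product', '9.99')

    producer.flush()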