From 9ab357e26f28eeb905c1475254e470de1b93c037 Mon Sep 17 00:00:00 2001 From: Myned Date: Tue, 24 Sep 2019 19:10:56 -0400 Subject: [PATCH 1/4] Add html5lib requirement as bs4 parser --- Pipfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Pipfile b/Pipfile index c08f897..9a61dbb 100644 --- a/Pipfile +++ b/Pipfile @@ -20,6 +20,7 @@ beautifulsoup4 = "*" "discord.py" = {extras = ["voice"],git = "https://github.com/Rapptz/discord.py"} "hurry.filesize" = "*" requests = "*" +html5lib = "*" [dev-packages] lxml = "*" From 3741b0e694101efe3ab7fd67370977369f0364a0 Mon Sep 17 00:00:00 2001 From: Myned Date: Tue, 24 Sep 2019 19:11:29 -0400 Subject: [PATCH 2/4] Add text argument to fetch util --- src/utils/utils.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/utils/utils.py b/src/utils/utils.py index e907744..b86541e 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -85,14 +85,17 @@ color = d.Color(0x1A1A1A) last_commands = {} -async def fetch(url, *, params={}, json=False, response=False): +async def fetch(url, *, params={}, json=False, response=False, text=False): async with aiohttp.ClientSession() as session: async with session.get(url, params=params, headers={'User-Agent': 'Myned/Modufur'}, ssl=False) as r: - if response: - return r - elif json: + if json: return await r.json() - return await r.read() + elif response: + return r + elif text: + return await r.text() + else: + return await r.read() def generate_embed(ctx, *, title=d.Embed.Empty, kind='rich', description=d.Embed.Empty, url=d.Embed.Empty, timestamp=d.Embed.Empty, colour=color, footer={}, image=d.Embed.Empty, thumbnail=d.Embed.Empty, author={}, fields=[]): From 1262dc2ba78b4cef15749d3bed46ae3083e59b0f Mon Sep 17 00:00:00 2001 From: Myned Date: Tue, 24 Sep 2019 19:12:31 -0400 Subject: [PATCH 3/4] Change package reference to avoid clashes --- src/utils/scraper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/utils/scraper.py 
b/src/utils/scraper.py index dae24f5..1ad5b78 100644 --- a/src/utils/scraper.py +++ b/src/utils/scraper.py @@ -3,7 +3,7 @@ import ast import re from bs4 import BeautifulSoup -from lxml import html +import lxml from hurry.filesize import size, alternative from misc import exceptions as exc @@ -108,7 +108,7 @@ async def get_post(url): async def get_image(url): content = await u.fetch(url) - value = html.fromstring(content).xpath( + value = lxml.html.fromstring(content).xpath( 'string(/html/body/div[@id="content"]/div[@id="post-view"]/div[@class="content"]/div[2]/img/@src)') return value From 5987b4692535af69f8891cb919fbd26a6aa7f28f Mon Sep 17 00:00:00 2001 From: Myned Date: Tue, 24 Sep 2019 19:13:10 -0400 Subject: [PATCH 4/4] Fix Kheina parsing and eval --- src/utils/scraper.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/utils/scraper.py b/src/utils/scraper.py index 1ad5b78..e9590a0 100644 --- a/src/utils/scraper.py +++ b/src/utils/scraper.py @@ -12,7 +12,7 @@ from utils import utils as u # async def get_harry(url): # content = await u.fetch('https://iqdb.harry.lu', params={'url': url}) -# soup = BeautifulSoup(content, 'html.parser') +# soup = BeautifulSoup(content, 'html5lib') # # if soup.find('div', id='show1').string is 'Not the right one? 
': parent = soup.find('th', string='Probable match:').parent.parent @@ -41,17 +41,25 @@ from utils import utils as u async def get_kheina(url): - content = await u.fetch('https://kheina.com', params={'url': url}) - soup = BeautifulSoup(content, 'html.parser') + content = await u.fetch('https://kheina.com', params={'url': url}, text=True) - results = ast.literal_eval(soup.find('data', id='results').string)[-1] - iqdbdata = ast.literal_eval(soup.find('data', id='iqdbdata').string)[0] + content = content.replace('&quot;', 'quot;').replace('&#39;', 'apos;') + soup = BeautifulSoup(content, 'html5lib') + results = soup.find('data', id='results').string.replace('quot;', '&quot;').replace('apos;', '&#39;') + results = ast.literal_eval(results) + iqdbdata = soup.find('data', id='iqdbdata').string + iqdbdata = ast.literal_eval(iqdbdata) + + for e in results: + if iqdbdata[0]['iqdbid'] in e: + match = e + break result = { - 'source': results[3], - 'artist': results[4], - 'thumbnail': f'https://f002.backblazeb2.com/file/kheinacom/{results[1]}.jpg', - 'similarity': str(int(float(iqdbdata['similarity']))), + 'source': match[3], + 'artist': match[4], + 'thumbnail': f'https://f002.backblazeb2.com/file/kheinacom/{match[1]}.jpg', + 'similarity': str(int(float(iqdbdata[0]['similarity']))), 'database': 'Kheina' } @@ -63,6 +71,7 @@ async def get_saucenao(url): 'https://saucenao.com/search.php', params={'url': url, 'api_key': u.config['saucenao_api'], 'output_type': 2}, json=True) + results = content['results'][0] for i in range(len(content['results'])): if 'e621' in content['results'][i]['header']['index_name']: