diff --git a/Pipfile b/Pipfile index c08f897..9a61dbb 100644 --- a/Pipfile +++ b/Pipfile @@ -20,6 +20,7 @@ beautifulsoup4 = "*" "discord.py" = {extras = ["voice"],git = "https://github.com/Rapptz/discord.py"} "hurry.filesize" = "*" requests = "*" +html5lib = "*" [dev-packages] lxml = "*" diff --git a/src/utils/scraper.py b/src/utils/scraper.py index dae24f5..e9590a0 100644 --- a/src/utils/scraper.py +++ b/src/utils/scraper.py @@ -3,7 +3,7 @@ import ast import re from bs4 import BeautifulSoup -from lxml import html +import lxml from hurry.filesize import size, alternative from misc import exceptions as exc @@ -12,7 +12,7 @@ from utils import utils as u # async def get_harry(url): # content = await u.fetch('https://iqdb.harry.lu', params={'url': url}) -# soup = BeautifulSoup(content, 'html.parser') +# soup = BeautifulSoup(content, 'html5lib') # # if soup.find('div', id='show1').string is 'Not the right one? ': # parent = soup.find('th', string='Probable match:').parent.parent @@ -41,17 +41,25 @@ from utils import utils as u async def get_kheina(url): - content = await u.fetch('https://kheina.com', params={'url': url}) - soup = BeautifulSoup(content, 'html.parser') + content = await u.fetch('https://kheina.com', params={'url': url}, text=True) - results = ast.literal_eval(soup.find('data', id='results').string)[-1] - iqdbdata = ast.literal_eval(soup.find('data', id='iqdbdata').string)[0] + content = content.replace('"', 'quot;').replace(''', 'apos;') + soup = BeautifulSoup(content, 'html5lib') + results = soup.find('data', id='results').string.replace('quot;', '"').replace('apos;', ''') + results = ast.literal_eval(results) + iqdbdata = soup.find('data', id='iqdbdata').string + iqdbdata = ast.literal_eval(iqdbdata) + + for e in results: + if iqdbdata[0]['iqdbid'] in e: + match = e + break result = { - 'source': results[3], - 'artist': results[4], - 'thumbnail': f'https://f002.backblazeb2.com/file/kheinacom/{results[1]}.jpg', - 'similarity': str(int(float(iqdbdata['similarity']))), + 'source': match[3], + 'artist': match[4], + 'thumbnail': f'https://f002.backblazeb2.com/file/kheinacom/{match[1]}.jpg', + 'similarity': str(int(float(iqdbdata[0]['similarity']))), 'database': 'Kheina' } @@ -63,6 +71,7 @@ async def get_saucenao(url): 'https://saucenao.com/search.php', params={'url': url, 'api_key': u.config['saucenao_api'], 'output_type': 2}, json=True) + results = content['results'][0] for i in range(len(content['results'])): if 'e621' in content['results'][i]['header']['index_name']: @@ -108,7 +117,7 @@ async def get_post(url): async def get_image(url): content = await u.fetch(url) - value = html.fromstring(content).xpath( + value = lxml.html.fromstring(content).xpath( 'string(/html/body/div[@id="content"]/div[@id="post-view"]/div[@class="content"]/div[2]/img/@src)') return value diff --git a/src/utils/utils.py b/src/utils/utils.py index e907744..b86541e 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -85,14 +85,17 @@ color = d.Color(0x1A1A1A) last_commands = {} -async def fetch(url, *, params={}, json=False, response=False): +async def fetch(url, *, params={}, json=False, response=False, text=False): async with aiohttp.ClientSession() as session: async with session.get(url, params=params, headers={'User-Agent': 'Myned/Modufur'}, ssl=False) as r: - if response: - return r - elif json: + if json: return await r.json() - return await r.read() + elif response: + return r + elif text: + return await r.text() + else: + return await r.read() def generate_embed(ctx, *, title=d.Embed.Empty, kind='rich', description=d.Embed.Empty, url=d.Embed.Empty, timestamp=d.Embed.Empty, colour=color, footer={}, image=d.Embed.Empty, thumbnail=d.Embed.Empty, author={}, fields=[]):