# modufur/src/utils/scraper.py
# Mirror of https://github.com/myned/modufur.git
import ast
import re

import aiohttp
import lxml
import lxml.html
from bs4 import BeautifulSoup
from hurry.filesize import alternative, size

from misc import exceptions as exc
from utils import utils as u
2017-09-24 11:05:28 -04:00
# async def get_harry(url):
# content = await u.fetch('https://iqdb.harry.lu', params={'url': url})
#
# soup = BeautifulSoup(content, 'html5lib')
#
# if soup.find('div', id='show1').string is 'Not the right one? ':
# parent = soup.find('th', string='Probable match:').parent.parent
#
# post = await u.fetch(
# 'https://e621.net/post/show.json',
# params={'id': re.search('show/([0-9]+)', parent.tr.td.a.get('href')).group(1)},
# json=True)
# if (post['status'] == 'deleted'):
# post = await u.fetch(
# 'https://e621.net/post/show.json',
# params={'id': re.search('#(\\d+)', post['delreason']).group(1)},
# json=True)
#
# result = {
# 'source': f'https://e621.net/post/show/{post["id"]}',
# 'artist': ', '.join(post['artist']),
# 'thumbnail': parent.td.a.img.get('src'),
# 'similarity': re.search('\\d+', parent.tr[4].td.string).group(0),
# 'database': 'Harry.lu'
# }
#
# return result
# else:
# return False
async def query_kheina(url):
    """Reverse-search an image URL against kheina.com.

    Args:
        url: Direct link to the image to look up.

    Returns:
        dict with 'source', 'artist', 'thumbnail', 'similarity' and
        'database' keys for the best match, or False when the top hit
        falls below the 55% similarity cutoff.
    """
    content = await u.fetch('https://kheina.com', params={'url': url}, text=True)

    # Neutralize quote characters so the Python-literal payloads embedded in
    # the <data> elements survive HTML parsing, then restore them before
    # handing the strings to ast.literal_eval.
    content = content.replace('"', 'quot;').replace("'", 'apos;')
    soup = BeautifulSoup(content, 'html5lib')
    results = soup.find('data', id='results').string.replace('quot;', '"').replace('apos;', "'")
    results = ast.literal_eval(results)
    iqdbdata = soup.find('data', id='iqdbdata').string
    iqdbdata = ast.literal_eval(iqdbdata)

    similarity = int(float(iqdbdata[0]['similarity']))
    if similarity < 55:
        return False

    # Find the result entry that corresponds to the best iqdb hit.
    for entry in results:
        if iqdbdata[0]['iqdbid'] in entry:
            match = entry
            break

    # NOTE(review): entry layout assumed from the original indices —
    # [1]=file hash, [3]=source URL, [4]=artist. Confirm against kheina.com.
    result = {
        'source': match[3],
        'artist': match[4],
        'thumbnail': f'https://f002.backblazeb2.com/file/kheinacom/{match[1]}.jpg',
        'similarity': str(similarity),
        'database': 'Kheina'
    }
    return result
async def query_saucenao(url):
    """Reverse-search an image URL against SauceNAO's JSON API.

    Args:
        url: Direct link to the image to look up.

    Returns:
        dict with 'source', 'artist', 'thumbnail', 'similarity' and
        'database' keys for the best match, or False when the top hit
        falls below the 55% similarity cutoff.
    """
    content = await u.fetch(
        'https://saucenao.com/search.php',
        params={'url': url, 'api_key': u.config['saucenao_api'], 'output_type': 2},
        json=True)

    match = content['results'][0]
    similarity = int(float(match['header']['similarity']))
    if similarity < 55:
        return False

    # Different source databases expose the artist under different keys;
    # probe them in order of preference, falling back to 'imdb_id'.
    if 'member_name' in match['data']:
        artist = 'member_name'
    elif 'creator' in match['data']:
        artist = 'creator'
    else:
        artist = 'imdb_id'

    result = {
        'source': match['data']['ext_urls'][0],
        'artist': match['data'][artist],
        'thumbnail': match['header']['thumbnail'],
        'similarity': str(similarity),
        'database': 'SauceNAO'
    }
    return result
async def get_post(url):
    """Resolve an image URL to its best reverse-search match.

    Tries Kheina first, then SauceNAO as a fallback.

    Args:
        url: Direct link to the image to look up.

    Returns:
        The result dict from query_kheina or query_saucenao.

    Raises:
        exc.SizeError: when the remote file exceeds 8 MiB.
        exc.MatchError: when neither backend returns a match; carries the
            filename portion of the URL.
        exc.MissingArgument: when the URL is not valid.
    """
    try:
        content = await u.fetch(url, response=True)

        filesize = int(content.headers['Content-Length'])
        if filesize > 8192 * 1024:  # 8 MiB cap
            raise exc.SizeError(size(filesize, system=alternative))

        result = await query_kheina(url)
        if not result:
            result = await query_saucenao(url)
            if not result:
                # Report just the trailing path segment (the filename).
                raise exc.MatchError(re.search(r'/([^/]+)$', url).group(1))
        return result
    except aiohttp.InvalidURL:
        raise exc.MissingArgument
async def get_image(url):
    """Scrape the direct image URL from a booru-style post page.

    Args:
        url: Link to the HTML post page.

    Returns:
        The value of the post image's src attribute as a string
        (empty string when the XPath matches nothing).
    """
    content = await u.fetch(url)

    # Requires `import lxml.html`; a bare `import lxml` does not load the
    # html submodule.
    value = lxml.html.fromstring(content).xpath(
        'string(/html/body/div[@id="content"]/div[@id="post-view"]/div[@class="content"]/div[2]/img/@src)')

    return value