modufur/src/utils/scraper.py

import asyncio
import re

from bs4 import BeautifulSoup
from lxml import html
from hurry.filesize import size, alternative

from misc import exceptions as exc
from utils import utils as u


async def get_post(url):
    """Reverse-search the image at ``url`` and return its best match.

    The image is looked up on iqdb.harry.lu first and resolved to an e621
    post; when iqdb yields no usable link, SauceNAO is queried instead.

    Returns:
        A ``(post, source, similarity)`` tuple. ``post`` is either the value
        fetched from e621's ``show.json`` or a small stand-in dict built
        from the SauceNAO result, ``source`` is a URL string and
        ``similarity`` is a string such as ``'87% Match'``.

    Raises:
        exc.SizeError: the reported Content-Length exceeds 8192 * 1024 bytes.
        exc.MatchError: SauceNAO returned an empty result list, or its first
            result is below 55% similarity.
        exc.MissingArgument: an AttributeError, ValueError or KeyError
            occurred while reading or parsing a response, in either lookup.
    """
    try:
        try:
            image = await u.fetch(url, response=True)
            filesize = int(image.headers['Content-Length'])
            if filesize > 8192 * 1024:
                raise exc.SizeError(size(filesize, system=alternative))

            match = await _search_iqdb(url)
        except IndexError:
            # e.g. the iqdb page had fewer than two links: use the fallback.
            match = None

        if match is None:
            # Run the fallback outside the IndexError handler but inside the
            # outer try, so that parsing failures in the SauceNAO response are
            # converted to MissingArgument too. Exceptions raised inside an
            # except clause are not seen by sibling except clauses.
            match = await _search_saucenao(url)

        return match

    except (AttributeError, ValueError, KeyError):
        raise exc.MissingArgument


async def _search_iqdb(url):
    """Look ``url`` up on iqdb and resolve the hit to an e621 post.

    Returns the ``(post, source, similarity)`` tuple, or ``None`` when iqdb
    reports no source link (href of ``'#'``).
    """
    content = await u.fetch('http://iqdb.harry.lu', params={'url': url})
    soup = BeautifulSoup(content, 'html.parser')
    # The second anchor on the page is used as the match link; an IndexError
    # here (fewer than two anchors) is handled by the caller.
    source = soup.find_all('a', limit=2)[1].get('href')
    if source == '#':
        return None

    ident = re.search('show/([0-9]+)', source).group(1)
    post = await u.fetch('http://e621.net/post/show.json', params={'id': ident}, json=True)
    if post['status'] == 'deleted':
        # Follow the '#<id>' reference found in the deletion reason.
        ident = re.search('#(\\d+)', post['delreason']).group(1)
        post = await u.fetch('http://e621.net/post/show.json', params={'id': ident}, json=True)

    source = f'https://e621.net/post/show/{post["id"]}'
    similarity = re.search('\\d+', soup.find(string=re.compile('similarity'))).group(0)

    return post, source, similarity + '% Match'


async def _search_saucenao(url):
    """Look ``url`` up on SauceNAO and build a post-like dict from the first result.

    Raises exc.MatchError (with the last path segment of ``url``) when there
    are no results or the first result is below 55% similarity.
    """
    content = await u.fetch(
        'https://saucenao.com/search.php',
        params={
            'url': url,
            'api_key': u.config['saucenao_api'],
            'output_type': 2},
        json=True)

    results = content['results']
    if results:
        result = results[0]
        data = result['data']
        # Use whichever artist field this result carries.
        if 'author_name' in data:
            artist = 'author_name'
        elif 'member_name' in data:
            artist = 'member_name'
        else:
            artist = 'creator'

        post = {
            'file_url': result['header']['thumbnail'],
            'artist': [data[artist]],
            'score': 'SauceNAO'}
        source = data['ext_urls'][0]
        # Keep only the integer part of the similarity value.
        similarity = re.search('(\\d+)\\.', result['header']['similarity']).group(1)

        if int(similarity) >= 55:
            return post, source, similarity + '% Match'

    # No results at all, or the first result is too dissimilar.
    raise exc.MatchError(re.search('\\/([^\\/]+)$', url).group(1))


async def get_image(url):
    """Fetch the page at ``url`` and return the ``src`` of its post image.

    The fetched content is parsed as HTML and queried with a fixed XPath
    that points at the ``<img>`` inside the ``post-view`` content area.
    Because the expression is wrapped in XPath ``string()``, the result is
    a string (empty when nothing matches).
    """
    page = await u.fetch(url)
    document = html.fromstring(page)

    image_src_query = (
        'string(/html/body/div[@id="content"]/div[@id="post-view"]'
        '/div[@class="content"]/div[2]/img/@src)')

    return document.xpath(image_src_query)
Lowered rate for iqdb requests to reduce possibility of ip ban 2017-10-27 21:31:57 -04:00			`import asyncio`
Caught argument errors and rxed the exception for output 2017-10-15 15:32:35 -04:00			`import re`

Initial commit 2017-09-24 11:05:28 -04:00			`from bs4 import BeautifulSoup`
			`from lxml import html`
Added check for filesize before requesting from iqdb 2017-11-20 02:12:56 -05:00			`from hurry.filesize import size, alternative`
Removed redundant scraper methods, autopep8 2017-10-12 22:30:40 -04:00
Initial commit 2017-09-24 11:05:28 -04:00			`from misc import exceptions as exc`
Removed redundant scraper methods, autopep8 2017-10-12 22:30:40 -04:00			`from utils import utils as u`

Initial commit 2017-09-24 11:05:28 -04:00
get_image() from post 2017-10-16 02:06:33 -04:00			`async def get_post(url):`
Added check for filesize before requesting from iqdb 2017-11-20 02:12:56 -05:00			`try:`
			`image = await u.fetch(url, response=True)`
			`filesize = int(image.headers['Content-Length'])`
			`if filesize > 8192 * 1024:`
			`raise exc.SizeError(size(filesize, system=alternative))`
Remove manual rate limiting discord.py now handles this internally 2018-11-03 17:00:36 -04:00
Add support for SauceNAO as fallback reverse image search engine 2019-09-17 02:59:35 -04:00			`content = await u.fetch('http://iqdb.harry.lu', params={'url': url})`
			`soup = BeautifulSoup(content, 'html.parser')`
			`source = soup.find_all('a', limit=2)[1].get('href')`
2 > 4 space 2017-10-20 16:23:27 -04:00
Add support for SauceNAO as fallback reverse image search engine 2019-09-17 02:59:35 -04:00			`if source != '#':`
			`ident = re.search('show/([0-9]+)', source).group(1)`
Unified qis and ris into respective ris commands 2017-11-19 23:25:30 -05:00			`post = await u.fetch('http://e621.net/post/show.json', params={'id': ident}, json=True)`
Fix some reasons bot may encounter HTTP exceptions when reversing images 2019-04-28 11:07:36 -04:00			`if (post['status'] == 'deleted'):`
			`ident = re.search('#(\\d+)', post['delreason']).group(1)`
			`post = await u.fetch('http://e621.net/post/show.json', params={'id': ident}, json=True)`
Add support for SauceNAO as fallback reverse image search engine 2019-09-17 02:59:35 -04:00			`source = f'https://e621.net/post/show/{post["id"]}'`
Add limit to similarity of search result 2019-09-22 13:39:04 -04:00			`similarity = re.search('\\d+', soup.find(string=re.compile('similarity'))).group(0)`
Fix some reasons bot may encounter HTTP exceptions when reversing images 2019-04-28 11:07:36 -04:00
Add limit to similarity of search result 2019-09-22 13:39:04 -04:00			`return post, source, similarity + '% Match'`
2 > 4 space 2017-10-20 16:23:27 -04:00			`else:`
			`raise IndexError`
Whitespace inconsistency 2017-10-21 16:39:11 -04:00
2 > 4 space 2017-10-20 16:23:27 -04:00			`except IndexError:`
Add support for SauceNAO as fallback reverse image search engine 2019-09-17 02:59:35 -04:00			`content = await u.fetch(`
			`'https://saucenao.com/search.php',`
			`params={`
			`'url': url,`
			`'api_key': u.config['saucenao_api'],`
			`'output_type': 2},`
			`json=True)`
			`result = content['results'][0]`
			`if 'author_name' in result['data']:`
			`artist = 'author_name'`
			`elif 'member_name' in result['data']:`
			`artist = 'member_name'`
			`else:`
			`artist = 'creator'`
			`post = {`
			`'file_url': result['header']['thumbnail'],`
			`'artist': [result['data'][artist]],`
			`'score': 'SauceNAO'}`
			`source = result['data']['ext_urls'][0]`
Add limit to similarity of search result 2019-09-22 13:39:04 -04:00			`similarity = re.search('(\\d+)\\.', result['header']['similarity']).group(1)`
Add support for SauceNAO as fallback reverse image search engine 2019-09-17 02:59:35 -04:00
Add limit to similarity of search result 2019-09-22 13:39:04 -04:00			`if int(similarity) >= 55:`
			`return post, source, similarity + '% Match'`
Removed redundant scraper methods, autopep8 2017-10-12 22:30:40 -04:00
Add support for SauceNAO as fallback reverse image search engine 2019-09-17 02:59:35 -04:00			`raise exc.MatchError(re.search('\\/([^\\/]+)$', url).group(1))`

			`except (AttributeError, ValueError, KeyError):`
			`raise exc.MissingArgument`
get_image() from post 2017-10-16 02:06:33 -04:00

			`async def get_image(url):`
2 > 4 space 2017-10-20 16:23:27 -04:00			`content = await u.fetch(url)`
get_image() from post 2017-10-16 02:06:33 -04:00
2 > 4 space 2017-10-20 16:23:27 -04:00			`value = html.fromstring(content).xpath(`
			`'string(/html/body/div[@id="content"]/div[@id="post-view"]/div[@class="content"]/div[2]/img/@src)')`
get_image() from post 2017-10-16 02:06:33 -04:00
2 > 4 space 2017-10-20 16:23:27 -04:00			`return value`