1
0
Fork 0
mirror of https://github.com/myned/modufur.git synced 2025-03-13 17:16:25 +00:00
modufur/src/utils/scraper.py

74 lines
2.5 KiB
Python
Raw Normal View History

import asyncio
import re
2017-09-24 11:05:28 -04:00
from bs4 import BeautifulSoup
from lxml import html
from hurry.filesize import size, alternative
2017-09-24 11:05:28 -04:00
from misc import exceptions as exc
from utils import utils as u
2017-09-24 11:05:28 -04:00
2017-10-16 02:06:33 -04:00
def _url_filename(url):
    """Return the text after the last '/' in *url*, or *url* itself if none.

    Used only to label ``exc.MatchError``; it must never raise, otherwise the
    "no match" path would fail with an unrelated ``AttributeError``.
    """
    match = re.search(r'\/([^\/]+)$', url)
    return match.group(1) if match else url


async def _saucenao_post(url):
    """Look *url* up on SauceNAO and return ``(post, source, similarity)``.

    ``post`` is a dict with the keys ``file_url``, ``artist`` and ``score``
    built from the first SauceNAO result.

    Raises:
        exc.MatchError: the response has no usable first result (missing or
            empty ``results``, no artist/``ext_urls``/``similarity`` field),
            or the similarity is below 55.
    """
    content = await u.fetch(
        'https://saucenao.com/search.php',
        params={
            'url': url,
            'api_key': u.config['saucenao_api'],
            'output_type': 2},
        json=True)

    # This code runs from inside an ``except`` handler of get_post, so the
    # other ``except`` clauses there do NOT apply to it. Translate lookup and
    # parse failures on the response here instead of leaking raw errors.
    try:
        result = content['results'][0]
        data = result['data']
        if 'author_name' in data:
            artist = 'author_name'
        elif 'member_name' in data:
            artist = 'member_name'
        else:
            artist = 'creator'
        post = {
            'file_url': result['header']['thumbnail'],
            'artist': [data[artist]],
            'score': 'SauceNAO'}
        source = data['ext_urls'][0]
        # Keep only the integer part of the similarity value.
        similarity = re.search(r'(\d+)\.', result['header']['similarity']).group(1)
    except (AttributeError, IndexError, KeyError, TypeError) as err:
        raise exc.MatchError(_url_filename(url)) from err

    if int(similarity) >= 55:
        return post, source, similarity + '% Match'

    raise exc.MatchError(_url_filename(url))


async def get_post(url):
    """Reverse-search the image at *url* and return ``(post, source, similarity)``.

    The image is first checked against a size limit, then searched on
    iqdb.harry.lu. If that yields a match, the e621 post JSON is fetched
    (following the replacement id in ``delreason`` when the post is deleted).
    Otherwise the search falls back to SauceNAO.

    Returns:
        A tuple ``(post, source, similarity)`` where ``similarity`` is a
        string such as ``'87% Match'``.

    Raises:
        exc.SizeError: the ``Content-Length`` of *url* exceeds 8192 * 1024 bytes.
        exc.MatchError: neither service produced a usable match.
        exc.MissingArgument: an ``AttributeError``, ``ValueError`` or
            ``KeyError`` occurred while checking the image or parsing the
            iqdb/e621 responses.
    """
    try:
        # presumably response=True makes u.fetch return the response object
        # rather than its body - confirm against utils.fetch.
        image = await u.fetch(url, response=True)
        filesize = int(image.headers['Content-Length'])
        if filesize > 8192 * 1024:
            raise exc.SizeError(size(filesize, system=alternative))

        content = await u.fetch('http://iqdb.harry.lu', params={'url': url})
        soup = BeautifulSoup(content, 'html.parser')
        # Fewer than two anchors raises IndexError, which triggers the fallback.
        source = soup.find_all('a', limit=2)[1].get('href')

        if source == '#':
            # No match link on the page: reuse the IndexError fallback path.
            raise IndexError

        ident = re.search(r'show/([0-9]+)', source).group(1)
        post = await u.fetch('http://e621.net/post/show.json', params={'id': ident}, json=True)
        if post['status'] == 'deleted':
            # Follow the "#<id>" reference in the deletion reason.
            ident = re.search(r'#(\d+)', post['delreason']).group(1)
            post = await u.fetch('http://e621.net/post/show.json', params={'id': ident}, json=True)
        # NOTE(review): the listing this was recovered from had lost its
        # indentation; this line is assumed to run for every match, not only
        # for deleted posts - confirm against the repository.
        source = f'https://e621.net/post/show/{post["id"]}'
        similarity = re.search(r'\d+', soup.find(string=re.compile('similarity'))).group(0)

        return post, source, similarity + '% Match'

    except IndexError:
        return await _saucenao_post(url)

    except (AttributeError, ValueError, KeyError) as err:
        raise exc.MissingArgument from err
2017-10-16 02:06:33 -04:00
async def get_image(url):
    """Fetch the page at *url* and return the ``src`` of its post image.

    The value is the string result of a fixed XPath query against the parsed
    page; XPath's ``string()`` yields an empty string when nothing matches.
    """
    image_src_xpath = (
        'string(/html/body/div[@id="content"]/div[@id="post-view"]/div[@class="content"]/div[2]/img/@src)')

    page = await u.fetch(url)
    document = html.fromstring(page)

    return document.xpath(image_src_xpath)