# modufur/src/utils/scraper.py
# Mirror of https://github.com/myned/modufur.git
import ast
import re

import aiohttp
import lxml
import lxml.html
from bs4 import BeautifulSoup
from hurry.filesize import alternative, size

from misc import exceptions as exc
from utils import utils as u
2017-09-24 11:05:28 -04:00
# async def get_harry(url):
# content = await u.fetch('https://iqdb.harry.lu', params={'url': url})
#
# soup = BeautifulSoup(content, 'html5lib')
#
# if soup.find('div', id='show1').string is 'Not the right one? ':
# parent = soup.find('th', string='Probable match:').parent.parent
#
# post = await u.fetch(
# 'https://e621.net/post/show.json',
# params={'id': re.search('show/([0-9]+)', parent.tr.td.a.get('href')).group(1)},
# json=True)
# if (post['status'] == 'deleted'):
# post = await u.fetch(
# 'https://e621.net/post/show.json',
# params={'id': re.search('#(\\d+)', post['delreason']).group(1)},
# json=True)
#
# result = {
# 'source': f'https://e621.net/post/show/{post["id"]}',
# 'artist': ', '.join(post['artist']),
# 'thumbnail': parent.td.a.img.get('src'),
# 'similarity': re.search('\\d+', parent.tr[4].td.string).group(0),
# 'database': 'Harry.lu'
# }
#
# return result
# else:
# return False
async def query_kheina(url):
    """Reverse-search an image URL against kheina.com.

    Args:
        url: Direct link to the image to look up.

    Returns:
        dict with 'source', 'artist', 'thumbnail', 'similarity' and
        'database' keys for the best match, or False when the top hit
        falls below the 55% similarity cutoff.
    """
    content = await u.fetch('https://kheina.com', params={'url': url}, text=True)

    # Neutralize quote characters so the Python-literal payloads embedded in
    # the <data> elements survive HTML parsing, then restore them before
    # handing the strings to ast.literal_eval.
    content = content.replace('"', 'quot;').replace("'", 'apos;')
    soup = BeautifulSoup(content, 'html5lib')
    results = soup.find('data', id='results').string.replace('quot;', '"').replace('apos;', "'")
    results = ast.literal_eval(results)
    iqdbdata = soup.find('data', id='iqdbdata').string
    iqdbdata = ast.literal_eval(iqdbdata)

    similarity = int(float(iqdbdata[0]['similarity']))
    if similarity < 55:
        return False

    # Find the result entry that corresponds to the best iqdb hit.
    for entry in results:
        if iqdbdata[0]['iqdbid'] in entry:
            match = entry
            break

    # NOTE(review): entry layout assumed from the original indices —
    # [1]=file hash, [3]=source URL, [4]=artist. Confirm against kheina.com.
    result = {
        'source': match[3],
        'artist': match[4],
        'thumbnail': f'https://f002.backblazeb2.com/file/kheinacom/{match[1]}.jpg',
        'similarity': str(similarity),
        'database': 'Kheina'
    }
    return result
async def query_saucenao(url):
    """Reverse-search an image URL against SauceNAO's JSON API.

    Args:
        url: Direct link to the image to look up.

    Returns:
        dict with 'source', 'artist', 'thumbnail', 'similarity' and
        'database' keys for the best match, or False when the top hit
        falls below the 55% similarity cutoff.
    """
    content = await u.fetch(
        'https://saucenao.com/search.php',
        params={'url': url, 'api_key': u.config['saucenao_api'], 'output_type': 2},
        json=True)

    match = content['results'][0]
    similarity = int(float(match['header']['similarity']))
    if similarity < 55:
        return False

    # Different source databases expose the artist under different keys;
    # probe them in order of preference, falling back to 'imdb_id'.
    if 'member_name' in match['data']:
        artist = 'member_name'
    elif 'creator' in match['data']:
        artist = 'creator'
    else:
        artist = 'imdb_id'

    result = {
        'source': match['data']['ext_urls'][0],
        'artist': match['data'][artist],
        'thumbnail': match['header']['thumbnail'],
        'similarity': str(similarity),
        'database': 'SauceNAO'
    }
    return result
async def get_post(url):
    """Resolve an image URL to its best reverse-search match.

    Tries Kheina first, then SauceNAO as a fallback.

    Args:
        url: Direct link to the image to look up.

    Returns:
        The result dict from query_kheina or query_saucenao.

    Raises:
        exc.SizeError: when the remote file exceeds 8 MiB.
        exc.MatchError: when neither backend returns a match; carries the
            filename portion of the URL.
        exc.MissingArgument: when the URL is not valid.
    """
    try:
        content = await u.fetch(url, response=True)

        filesize = int(content.headers['Content-Length'])
        if filesize > 8192 * 1024:  # 8 MiB cap
            raise exc.SizeError(size(filesize, system=alternative))

        result = await query_kheina(url)
        if not result:
            result = await query_saucenao(url)
            if not result:
                # Report just the trailing path segment (the filename).
                raise exc.MatchError(re.search(r'/([^/]+)$', url).group(1))
        return result
    except aiohttp.InvalidURL:
        raise exc.MissingArgument
async def get_image(url):
    """Scrape the direct image URL from a booru-style post page.

    Args:
        url: Link to the HTML post page.

    Returns:
        The value of the post image's src attribute as a string
        (empty string when the XPath matches nothing).
    """
    content = await u.fetch(url)

    # Requires `import lxml.html`; a bare `import lxml` does not load the
    # html submodule.
    value = lxml.html.fromstring(content).xpath(
        'string(/html/body/div[@id="content"]/div[@id="post-view"]/div[@class="content"]/div[2]/img/@src)')

    return value