delojza/delojza.py

514 lines
22 KiB
Python
Raw Normal View History

2018-01-31 12:32:25 +01:00
#!/usr/bin/env python3
2018-02-02 15:28:49 +01:00
import errno
2018-01-30 13:47:33 +01:00
import logging
import os
2018-01-31 14:34:59 +01:00
import re
2018-01-31 14:11:42 +01:00
import shutil
2018-01-31 12:30:08 +01:00
import sys
2019-05-03 11:41:54 +02:00
import tempfile
from configparser import ConfigParser
from datetime import datetime, timedelta
2018-01-31 14:23:01 +01:00
from glob import glob
from operator import itemgetter
2018-01-30 13:47:33 +01:00
2019-05-02 19:18:35 +02:00
import acoustid
2018-01-31 14:34:59 +01:00
import filetype
2018-10-02 16:12:32 +02:00
import markovify
2019-05-01 10:50:21 +02:00
import mutagen.id3
2018-10-28 14:00:25 +01:00
import pytumblr
2018-01-30 13:47:33 +01:00
import requests
import telegram
2018-01-30 13:47:33 +01:00
import youtube_dl
from telegram import MessageEntity
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters
from youtube_dl.version import __version__ as YTDL_VERSION
2018-01-30 13:47:33 +01:00
2018-02-02 15:28:49 +01:00
def mkdir_p(path):
    """Create ``path`` and any missing parents, like ``mkdir -p``.

    An already existing directory is not an error. Any other failure
    (including ``path`` existing as a non-directory) raises ``OSError``.
    """
    # exist_ok=True only tolerates an existing *directory*, which is the same
    # contract as the previous hand-written errno.EEXIST / isdir check.
    os.makedirs(path, exist_ok=True)
def datestr(date):
    """Format ``date`` as ``YYYY-MM-DD@HHMM`` (used as a filename prefix)."""
    return "{:%Y-%m-%d@%H%M}".format(date)
2018-04-25 14:30:08 +02:00
class DelojzaBot:
    """Telegram bot that downloads linked or attached media into hashtag directories.

    The hashtags of a message (upper-cased) become the path components below
    ``out_dir``. Messages without hashtags may reuse hashtags the same user sent
    in the same chat within the last hour. Downloads whose tags contain
    ``AUDIO`` or ``RADIO`` are treated as audio and mp3 results are ID3-tagged.
    A first hashtag of ``TUMBLR`` / ``TUMBLR_NOW`` additionally posts the
    downloaded files to the configured tumblr blog.
    """

    def __init__(self, tg_api_key, out_dir, tmp_dir=None,
                 acoustid_key=None, tumblr_name=None, tumblr_keys=None, markov=None):
        """Set up logging, directories, Telegram handlers and optional integrations.

        :param tg_api_key: Telegram bot token passed to ``Updater``.
        :param out_dir: root directory for downloads (stored as an absolute path).
        :param tmp_dir: scratch directory for youtube-dl; defaults to the system temp dir.
        :param acoustid_key: AcoustID API key, or None to skip fingerprint lookups.
        :param tumblr_name: blog name for tumblr posting.
        :param tumblr_keys: iterable of arguments for ``pytumblr.TumblrRestClient``.
        :param markov: optional object providing ``make_sentence`` / ``add_to_corpus``.
        """
        self.logger = logging.getLogger("delojza")

        self.out_dir = os.path.abspath(out_dir)
        self.logger.debug('OUT_DIR: ' + out_dir)
        self.tmp_dir = tmp_dir if tmp_dir else tempfile.gettempdir()
        # Log the resolved value: the raw argument is None by default and
        # cannot be concatenated to a string.
        self.logger.debug('TMP_DIR: ' + self.tmp_dir)
        self.markov = markov

        self.updater = Updater(tg_api_key)
        dp = self.updater.dispatcher

        # Handler registration order is kept exactly as before.
        dp.add_handler(CommandHandler("start", self.tg_start))
        dp.add_error_handler(self.tg_error)
        self.tg_url_handler = MessageHandler(Filters.entity(MessageEntity.URL), self.tg_handle_url)
        dp.add_handler(self.tg_url_handler)
        self.tg_rest_handler = MessageHandler(Filters.photo | Filters.video | Filters.video_note |
                                              Filters.audio | Filters.voice | Filters.document,
                                              self.tg_handle_rest)
        dp.add_handler(self.tg_rest_handler)
        dp.add_handler(MessageHandler(Filters.entity(MessageEntity.HASHTAG), self.tg_handle_hashtag))
        dp.add_handler(MessageHandler(Filters.text, self.tg_handle_text))
        dp.add_handler(CommandHandler("stats", self.tg_stats))
        dp.add_handler(CommandHandler("orphans", self.tg_orphan))
        dp.add_handler(CommandHandler("orphans_full", self.tg_orphan_full))
        dp.add_handler(CommandHandler("delete", self.tg_delete))
        dp.add_handler(CommandHandler("version", self.tg_version))

        self.acoustid_key = acoustid_key

        # Always define the tumblr attributes so later checks such as
        # ``if self.tumblr_client`` cannot raise AttributeError.
        self.tumblr_name = None
        self.tumblr_client = None
        if tumblr_name and tumblr_keys:
            self.tumblr_name = tumblr_name
            self.tumblr_client = pytumblr.TumblrRestClient(*tumblr_keys)

        # (chat, filenames, hashtags, tumblr_ids) of the most recent download
        self.last_downloaded = None
        # (user, chat, timestamp, hashtags) of the most recent hashtag-only message
        self.last_hashtags = None

    @staticmethod
    def ytdl_can(url):
        """Return True if youtube-dl has a dedicated (non-generic) extractor for ``url``.

        URLs containing ``/channel/`` are always rejected.
        """
        if '/channel/' in url:
            return False
        return any(ie.suitable(url) and ie.IE_NAME != 'generic'
                   for ie in youtube_dl.extractor.gen_extractors())

    def _acoustid_lookup(self, filepath):
        """Return ``(title, artist, source)`` for the best AcoustID match, or None.

        Only the highest-scoring result is considered, and only if its score
        exceeds 0.8. Lookup failures are logged as warnings and yield None.
        """
        try:
            self.logger.debug("Requesting AcoustID for {}".format(filepath))
            best = max(acoustid.match(self.acoustid_key, filepath), key=itemgetter(0), default=None)
        except acoustid.NoBackendError:
            self.logger.warning("chromaprint library/tool not found")
        except acoustid.FingerprintGenerationError:
            self.logger.warning("fingerprint could not be calculated")
        except acoustid.WebServiceError as exc:
            self.logger.warning("web service request failed: {}".format(exc.message))
        else:
            if best is not None:
                score, _rid, aid_title, aid_artist = best
                if score > .8:
                    return aid_title, aid_artist, "AcoustID ({}%)".format(round(score * 100))
        return None

    def tag_file(self, filepath, message, info=None):
        """Write ID3 title/artist tags to ``filepath`` and report the result to the chat.

        Sources are tried in this order: ``track``/``artist`` from ``info``,
        AcoustID (when a key is configured and something is still missing),
        an ``artist - title`` split of ``info['title']``, the full
        ``info['title']``, and the uploader for soundcloud extractions.

        :param filepath: path of the audio file to tag.
        :param message: Telegram message used for ``reply_text`` feedback.
        :param info: optional youtube-dl info dict.
        """
        if info is None:
            info = {}

        title = info.get('track')
        artist = info.get('artist')
        source = "supplied metadata" if ('track' in info or 'artist' in info) else None

        # Parenthesised on purpose: without a key there must be no lookup at all.
        if (title is None or artist is None) and self.acoustid_key:
            match = self._acoustid_lookup(filepath)
            if match is not None:
                title, artist, source = match

        full_title = info.get("title") or ""
        if title is None and artist is None and '-' in full_title:
            # Split on the first dash only so titles containing dashes stay intact.
            artist, title = full_title.split("-", 1)
            source = "fallback (artist - title)"

        if title is None and 'title' in info:
            title = info['title']
            source = "full title fallback"

        if artist is None and 'soundcloud' in (info.get("extractor") or ""):
            artist = info['uploader']
            source = "soundcloud \"fallback\""

        artist = artist.strip() if artist else None
        title = title.strip() if title else None

        if title is None and artist is None:
            message.reply_text("Tried tagging, found nothing :(")
            return

        message.reply_text("Tagging as \"{}\" by \"{}\"\nvia {}".format(title, artist, source))
        self.logger.info("Tagging {} w/ {} - {} [{}]...".format(filepath, title, artist, source))

        try:
            id3 = mutagen.id3.ID3(filepath)
        except mutagen.id3.ID3NoHeaderError:
            # File has no ID3 header yet: create an empty one, then reopen.
            mutafile = mutagen.File(filepath)
            mutafile.add_tags()
            mutafile.save()
            id3 = mutagen.id3.ID3(filepath)

        # Only write the frames we actually have a value for.
        if title:
            id3.add(mutagen.id3.TIT2(encoding=3, text=title))
        if artist:
            id3.add(mutagen.id3.TOPE(encoding=3, text=artist))
            id3.add(mutagen.id3.TPE1(encoding=3, text=artist))
        id3.save()

    # noinspection PyUnusedLocal
    def download_ytdl(self, urls, out_path, date, message, audio=False, filename=None):
        """Download ``urls`` with youtube-dl into the temp dir and move the results to ``out_path``.

        With ``audio=True`` the best audio stream is extracted to mp3. Every
        produced mp3 is passed through :meth:`tag_file`. ``filename`` is accepted
        for signature parity with :meth:`download_raw` and is not used.

        :return: list of destination paths of the moved files.
        """
        ydl_opts = {
            'noplaylist': True,
            'restrictfilenames': True,
            'outtmpl': os.path.join(self.tmp_dir, '{}__%(title)s__%(id)s.%(ext)s'.format(datestr(date)))
        }
        if audio:
            ydl_opts['format'] = 'bestaudio/best'
            ydl_opts['postprocessors'] = [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '256'
            }]

        filenames = []
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download(urls)
            infos = [ydl.extract_info(url, download=False) for url in urls]
            for info in infos:
                prepared = ydl.prepare_filename(info)
                # Post-processing may change the extension, so match on the stem.
                for globbed in glob(os.path.splitext(prepared)[0] + '.*'):
                    if globbed.endswith("mp3"):
                        self.tag_file(globbed, message, info=info)
                    self.logger.info("Moving %s to %s..." % (globbed, out_path))
                    filenames.append(shutil.move(globbed, out_path))
        return filenames

    def download_raw(self, urls, out_path, date, message, audio=False, filename=None):
        """Download each URL as-is into ``out_path`` via HTTP.

        Files are named ``<datestr>__<filename or last URL segment>``. If the
        name has no 3-5 character extension, one is guessed from the content
        with ``filetype``. With ``audio=True``, mp3 files lacking a ``TIT2``
        frame are passed to :meth:`tag_file`.

        :return: list of final file paths.
        :raises requests.HTTPError: when a URL answers with an error status.
        """
        filenames = []
        for url in urls:
            local_filename = os.path.join(out_path, "%s__%s" % (datestr(date), filename or url.split('/')[-1]))
            final_filename = local_filename
            is_mp3 = local_filename.endswith("mp3")

            # Close the streamed response deterministically and refuse to save
            # HTTP error pages as if they were the requested media.
            with requests.get(url, stream=True) as r:
                r.raise_for_status()
                with open(local_filename, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)

            if not re.match(r'.*\..{3,5}$', os.path.split(local_filename)[-1]):
                kind = filetype.guess(local_filename)
                if kind is None:
                    self.logger.error("File has no extension and could not be determined!")
                else:
                    self.logger.info('Moving file without extension... %s?' % kind.extension)
                    final_filename = shutil.move(local_filename, local_filename + '.' + kind.extension)
                    is_mp3 = kind.extension == "mp3"

            filenames.append(final_filename)

            if audio and is_mp3:
                try:
                    untagged = 'TIT2' not in mutagen.id3.ID3(final_filename)
                except mutagen.id3.ID3NoHeaderError:
                    untagged = True
                if untagged:
                    self.tag_file(final_filename, message)

        return filenames

    @staticmethod
    def extract_hashtags(message):
        """Return the upper-cased hashtags (without ``#``) of a message and its caption.

        Any hashtag containing ``PRAS`` is collapsed to exactly ``PRAS``.
        """
        raw = [message.parse_entity(entity)
               for entity in message.entities if entity.type == 'hashtag']
        raw += [message.parse_caption_entity(entity)
                for entity in message.caption_entities if entity.type == 'hashtag']
        cleaned = [hashtag[1:].upper() for hashtag in raw]
        return ["PRAS" if "PRAS" in hashtag else hashtag for hashtag in cleaned]

    def get_hashtags(self, message):
        """Return the message's hashtags, falling back to recently remembered ones.

        The fallback applies only when the message itself has no hashtags and the
        remembered hashtags come from the same user, in the same chat, less than
        an hour ago. Otherwise an empty list is returned.
        """
        hashtags = self.extract_hashtags(message)
        if len(hashtags) == 0 and self.last_hashtags is not None:
            # Unpack into separate names so a mismatch cannot leak the stale tags.
            user, chat, ts, remembered = self.last_hashtags
            if user == message.from_user and chat == message.chat and ts > datetime.now() - timedelta(hours=1):
                hashtags = remembered
        return hashtags

    def tg_handle_hashtag(self, bot, update):
        """Handle a hashtag-only message.

        As a reply, the hashtags are applied to the replied-to message
        (attachments and URLs). Otherwise they are remembered for
        :meth:`get_hashtags`.
        """
        hashtags = self.extract_hashtags(update.message)

        if update.message.reply_to_message:
            self.handle_tg_message(update.message.reply_to_message, bot, hashtags)
            self.handle_urls(update.message.reply_to_message, hashtags)
        else:
            self.last_hashtags = update.message.from_user, update.message.chat, datetime.now(), hashtags

    def _post_to_tumblr(self, filenames, state):
        """Create one tumblr photo post per file in the given ``state``; return the post ids."""
        post_ids = []
        for path in filenames:
            response = self.tumblr_client.create_photo(self.tumblr_name, state=state, data=path)
            post_ids.append(response['id'])
        return post_ids

    def handle(self, urls, message, hashtags, download_fn, filename=None):
        """Download ``urls`` with ``download_fn`` into the directory named by ``hashtags``.

        Nothing is downloaded when ``hashtags`` is empty. On success the download
        is remembered in ``last_downloaded`` (for ``/delete``) and the file list
        is returned. Any failure is logged and, unless it is a timeout, reported
        to the chat; ``None`` is returned in that case.
        """
        try:
            if len(hashtags) == 0:
                self.logger.info("Ignoring %s due to no hashtag present..." % urls)
                return

            # Remembered hashtags are single-use.
            self.last_hashtags = None

            self.logger.info("Downloading %s under '%s'" % (urls, "/".join(hashtags)))

            out_path = os.path.join(self.out_dir, *hashtags)
            mkdir_p(out_path)

            reply = 'Downloading to "{}"...'.format("/".join(hashtags))

            audio = any(tag in hashtag for hashtag in hashtags for tag in ('AUDIO', 'RADIO'))
            if audio and download_fn != self.download_raw:
                reply += ' (And also guessing you want to extract the audio)'
            message.reply_text(reply)

            filenames = download_fn(urls, out_path, message.date, message, audio=audio, filename=filename)

            tumblr_ids = []
            if hashtags[0] == 'TUMBLR' and self.tumblr_client:
                message.reply_text('(btw, queueing to tumblr)')
                tumblr_ids = self._post_to_tumblr(filenames, "queue")
            elif hashtags[0] == 'TUMBLR_NOW' and self.tumblr_client:
                message.reply_text('(btw, ***FIRING TO TUMBLR RIGHT AWAY***)',
                                   parse_mode=telegram.ParseMode.MARKDOWN)
                tumblr_ids = self._post_to_tumblr(filenames, "published")

            self.last_downloaded = message.chat, filenames, hashtags, tumblr_ids
            return filenames
        except Exception as exc:
            # Deliberately broad (a failed download must not kill the handler),
            # but no longer swallows KeyboardInterrupt / SystemExit.
            self.logger.exception("Handling %s failed", urls)
            if "Timed out" not in str(exc):
                message.reply_text("Something is FUCKED: [{}] {}".format(type(exc), exc))

    def tg_handle_url(self, _, update):
        """Telegram callback for messages containing URL entities."""
        self.handle_urls(update.message, self.get_hashtags(update.message))

    def handle_urls(self, message, hashtags):
        """Download every URL entity of ``message``.

        URLs youtube-dl can handle go through :meth:`download_ytdl`. Of the
        rest, only those whose ``Content-Type`` (from a HEAD request) does not
        contain ``text`` are fetched with :meth:`download_raw`.
        """
        urls = [message.parse_entity(entity)
                for entity in message.entities if entity.type == 'url']

        # Classify each URL once; ytdl_can() walks every youtube-dl extractor.
        ytdl_urls, normal_urls = [], []
        for url in urls:
            (ytdl_urls if self.ytdl_can(url) else normal_urls).append(url)

        if len(ytdl_urls) > 0:
            self.handle(ytdl_urls, message, hashtags, self.download_ytdl)

        if len(normal_urls) > 0:
            file_urls = [url for url in normal_urls if
                         "text" not in requests.head(url).headers.get("Content-Type", "text")]
            if len(file_urls) > 0:
                self.handle(file_urls, message, hashtags, self.download_raw)

    def tg_handle_rest(self, bot, update):
        """Telegram callback for photo/video/audio/voice/document messages."""
        self.handle_tg_message(update.message, bot, self.get_hashtags(update.message))

    def handle_tg_message(self, message, bot, hashtag):
        """Download the attachment of ``message`` (if any) under the given hashtags.

        For photos the widest size is used. Documents keep their file name and
        audio uses its title as the file name.
        """
        file, filename = None, None
        if len(message.photo) > 0:
            file = max(message.photo, key=lambda p: p.width).file_id
        elif message.document is not None:
            filename = message.document.file_name
            file = message.document.file_id
        elif message.audio is not None:
            filename = message.audio.title
            file = message.audio.file_id
        elif message.video is not None:
            file = message.video.file_id
        elif message.video_note is not None:
            file = message.video_note.file_id
        elif message.voice is not None:
            file = message.voice.file_id

        if file is not None:
            url = bot.getFile(file).file_path
            self.handle([url], message, hashtag, self.download_raw, filename=filename)

    def tg_handle_text(self, _, update):
        """Telegram callback for plain text: feed it to the markov corpus, if enabled."""
        if self.markov:
            self.markov.add_to_corpus(update.message.text)

    def tag_dirs(self):
        """Return the names of all-uppercase directories directly below ``out_dir``."""
        return [name for name in os.listdir(self.out_dir)
                if os.path.isdir(os.path.join(self.out_dir, name)) and name.upper() == name]

    def tg_stats(self, _, update):
        """Reply with per-tag entry counts broken down by file extension.

        Tags holding at most one entry are not detailed; they are listed as
        orphans at the end of the reply.
        """
        tag_dirs = self.tag_dirs()
        reply = "Total number of tags: {}\n\n".format(len(tag_dirs))

        counts = [(directory, os.listdir(os.path.join(self.out_dir, directory))) for directory in tag_dirs]
        # Largest tags first, ties in alphabetical order.
        counts.sort(key=lambda item: (-len(item[1]), item[0]))

        for directory, files in counts:
            # Sorted by size, so everything from here on is an orphan (0 or 1 entries).
            if len(files) <= 1:
                break
            abs_paths = [os.path.join(self.out_dir, directory, file) for file in files]
            abs_files = list(filter(os.path.isfile, abs_paths))
            exts = [ext[1:] for ext in (os.path.splitext(path)[1] for path in abs_files) if len(ext) > 0]
            # sorted() keeps the order of equally frequent extensions stable between runs.
            ext_counts = [(ext, exts.count(ext)) for ext in sorted(set(exts))]
            dir_cnt = len(abs_paths) - len(abs_files)
            # "directorie" + the "s" appended below renders as "directories".
            type_counts = ext_counts + ([("directorie", dir_cnt)] if dir_cnt > 0 else [])
            details = ", ".join(["{} {}s".format(cnt, mime) for mime, cnt in
                                 sorted(type_counts, key=itemgetter(1), reverse=True)])
            if len(type_counts) == 1:
                reply += "<b>{}:</b> {}\n".format(directory, details)
            else:
                reply += "<b>{}:</b> {} files ({})\n".format(directory, len(files), details)

        orphans = [entry for entry in counts if len(entry[1]) <= 1]
        if len(orphans) > 0:
            reply += "\nFollowing tags are orphans: " + ", ".join(map(itemgetter(0), orphans))

        update.message.reply_text(reply, parse_mode=telegram.ParseMode.HTML)

    def orphans(self):
        """Return ``(tag, entry)`` pairs, sorted by tag, for tags with at most one entry.

        Empty tags are reported with the placeholder ``"NO FILE AT ALL..."``.
        """
        result = []
        for directory in self.tag_dirs():
            files = os.listdir(os.path.join(self.out_dir, directory))
            if len(files) == 1:
                result.append((directory, files[0]))
            elif len(files) == 0:
                result.append((directory, "NO FILE AT ALL..."))
        return sorted(result, key=itemgetter(0))

    def tg_orphan(self, _, update):
        """Reply with the names of all orphan tags."""
        orphans = self.orphans()
        if len(orphans) == 0:
            update.message.reply_text("Good job, no orphan tags!")
        else:
            update.message.reply_text("The following tags only contain a single file:\n" +
                                      ", ".join(map(itemgetter(0), orphans)))

    def tg_orphan_full(self, _, update):
        """Reply with every orphan tag and its single entry, split into messages of at most 4096 chars."""
        orphans = self.orphans()
        if len(orphans) == 0:
            update.message.reply_text("Good job, no orphan tags!")
            return

        tmp_reply = "The following tags only contain a single file:\n"
        for directory, file in orphans:
            line = "{}: {}\n".format(directory, file)
            if len(tmp_reply + line) > 4096:
                update.message.reply_text(tmp_reply)
                tmp_reply = ""
            tmp_reply += line
        if len(tmp_reply) > 0:
            update.message.reply_text(tmp_reply)

    def tg_delete(self, _, update):
        """Undo the last download made in this chat.

        Removes the downloaded files, any directories below ``out_dir`` left
        empty by that, and the tumblr posts created for the download.
        """
        if self.last_downloaded is not None:
            chat, files, hashtags, tumblr_ids = self.last_downloaded
            if chat == update.message.chat:
                for file in files:
                    update.message.reply_text("Removing \"{}\"!".format(file[len(self.out_dir) + 1:]))
                    try:
                        os.remove(file)
                    except FileNotFoundError:
                        # Already gone (e.g. removed by hand): keep cleaning up the rest.
                        self.logger.warning("%s was already removed", file)
                    parent_dir = os.path.dirname(file)
                    # Climb only while strictly inside out_dir and the directory is
                    # empty, so the walk can never leave the tree or loop forever.
                    while parent_dir.startswith(self.out_dir + os.sep) and len(os.listdir(parent_dir)) == 0:
                        update.message.reply_text("Removing directory \"{}\" as it's empty..."
                                                  .format(parent_dir[len(self.out_dir) + 1:]))
                        os.rmdir(parent_dir)
                        parent_dir = os.path.dirname(parent_dir)
                if len(tumblr_ids) > 0:
                    plural = "s (all {} of them)".format(len(tumblr_ids)) if len(tumblr_ids) > 1 else ""
                    update.message.reply_text("Also deleting tumblr post{}!".format(plural))
                    if self.tumblr_client:
                        for tumblr_id in tumblr_ids:
                            self.tumblr_client.delete_post(self.tumblr_name, tumblr_id)
                self.last_downloaded = None
                return
        update.message.reply_text("Nothing to remove!")

    # noinspection PyMethodMayBeStatic
    def tg_version(self, _, update):
        """Reply with this file's modification time and the youtube-dl version."""
        modified = datetime.fromtimestamp(os.path.getmtime(os.path.realpath(__file__)))
        delojza_date = modified.strftime('%Y/%m/%d - %H:%M:%S')
        update.message.reply_text("delojza modified date: {}\nyoutube-dl version: {}"
                                  .format(delojza_date, YTDL_VERSION))

    def tg_start(self, _, update):
        """Reply to ``/start`` with a generated sentence, or ``HELLO`` if none is available."""
        # The `or` also covers a configured markov source that yields nothing.
        sentence = self.markov.make_sentence() if self.markov else None
        update.message.reply_text(sentence or "HELLO")

    def tg_error(self, bot, update, error):
        """Telegram error callback: log the error and react in the chat.

        After a timeout the attachment handler is run again for the same update;
        any other error is reported to the chat verbatim.
        """
        self.logger.error(error)
        if "Timed out" in str(error):
            if update is not None:
                default = "Mmmm, I like it..."
                update.message.reply_text((self.markov.make_sentence(tries=100) if self.markov else default) or default)
                self.tg_handle_rest(bot, update)
        else:
            if update is not None:
                update.message.reply_text("Something is fucked: %s" % error)

    def run_idle(self):
        """Start polling Telegram and block until the updater is stopped."""
        self.updater.start_polling()
        self.logger.info("Started Telegram bot...")
        self.updater.idle()
2018-01-30 13:47:33 +01:00
class MarkovBlabberer:
    """Markov-chain sentence generator backed by a newline-separated corpus file."""

    def __init__(self, filepath):
        """Load the corpus at ``filepath`` (lower-cased) and log a sample sentence.

        :raises FileNotFoundError: if the corpus file does not exist.
        """
        self.logger = logging.getLogger('markov')
        self.filepath = filepath

        with open(filepath) as f:
            text = f.read()
        self.markov = markovify.NewlineText(text.lower())
        # make_sentence() may return None; %-style logging args render that
        # instead of raising TypeError as string concatenation would.
        self.logger.info("Sentence of the day: %s", self.make_sentence())

    def make_sentence(self, tries=100):
        """Return a generated sentence, or whatever markovify returns after ``tries`` attempts."""
        return self.markov.make_sentence(tries=tries)

    def add_to_corpus(self, text):
        """Merge lower-cased ``text`` into the in-memory model and append it to the corpus file."""
        text = text.lower()
        new_sentence = markovify.NewlineText(text)
        self.markov = markovify.combine([self.markov, new_sentence])
        with open(self.filepath, 'a') as f:
            f.write(text + '\n')
2018-01-30 13:47:33 +01:00
if __name__ == '__main__':
    # Script entry point: locate the config file, build the bot and run it.
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    _DIR_ = os.path.dirname(os.path.realpath(__file__))
    # Searched in order; the first existing file wins.
    CONFIG_PATHS = ['/etc/delojza/delojza.ini',
                    os.path.join(os.getenv("HOME") or "", ".config/delojza/delojza.ini"),
                    os.path.join(_DIR_, "delojza.ini")]

    config = ConfigParser()
    try:
        CONF_FILE = next(conf_path for conf_path in CONFIG_PATHS if os.path.isfile(conf_path))
    except StopIteration:
        logging.error("No config file found, stopping.")
        sys.exit(-1)
    config.read(CONF_FILE)

    try:
        markov = MarkovBlabberer("initial.txt")
    except FileNotFoundError:
        # The markov corpus is optional.
        markov = None

    # AcoustID and tumblr are optional for DelojzaBot, so a missing option or
    # section must not abort startup: read them with a None fallback.
    tumblr_keys = tuple(config.get('tumblr', option, fallback=None)
                        for option in ('consumer_key', 'consumer_secret', 'oauth_key', 'oauth_secret'))
    if not all(tumblr_keys):
        tumblr_keys = None

    delojza = DelojzaBot(config.get('delojza', 'tg_api_key'),
                         config.get('delojza', 'OUT_DIR', fallback=os.path.join(_DIR_, "out")),
                         tmp_dir=config.get('delojza', 'tmp_dir', fallback=tempfile.gettempdir()),
                         acoustid_key=config.get('delojza', 'acoustid_api_key', fallback=None),
                         tumblr_name=config.get('tumblr', 'blog_name', fallback=None),
                         tumblr_keys=tumblr_keys,
                         markov=markov)
    delojza.run_idle()