From bfbc225d0e89a46adb59520a39268a9d84cb5be5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Ml=C3=A1dek?= <t@mldk.cz>
Date: Wed, 22 May 2019 15:21:25 +0200
Subject: [PATCH] sanitize filenames

---
 delojza.py | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/delojza.py b/delojza.py
index 457d328..8b0366c 100755
--- a/delojza.py
+++ b/delojza.py
@@ -92,6 +92,12 @@ class DelojzaBot:
                 return True
         return False
 
+    @staticmethod
+    def sanitize(filepath):
+        """Return filepath with each char that is not a word char, '.' or '-' replaced by '_'."""
+        # None is handed back unchanged instead of being passed to the regex.
+        return None if filepath is None else re.sub(r'[^\w.-]', '_', filepath)
+
     def tag_file(self, filepath, message, info=None):
         if info is None:
             info = {}
@@ -162,7 +168,7 @@ class DelojzaBot:
         id3.save()
 
     # noinspection PyUnusedLocal
-    def download_ytdl(self, urls, out_path, date, message, audio=False, filename=None):
+    def download_ytdl(self, urls, out_path, date, message, audio=False, filetitle=None):
         ydl_opts = {
             'noplaylist': True,
             'restrictfilenames': True,
@@ -179,7 +185,7 @@ class DelojzaBot:
         with youtube_dl.YoutubeDL(ydl_opts) as ydl:
             ydl.download(urls)
             for info in [ydl.extract_info(url, download=False) for url in urls]:
-                filename = ydl.prepare_filename(info)
+                filename = self.sanitize(ydl.prepare_filename(info))
                 globbeds = glob(os.path.splitext(filename)[0] + '.*')
                 for globbed in globbeds:
                     if globbed.endswith("mp3"):
@@ -189,10 +195,11 @@ class DelojzaBot:
                     filenames.append(dest)
         return filenames
 
-    def download_raw(self, urls, out_path, date, message, audio=False, filename=None):
+    def download_raw(self, urls, out_path, date, message, audio=False, filetitle=None):
         filenames = []
         for url in urls:
-            local_filename = os.path.join(out_path, "%s__%s" % (datestr(date), filename or url.split('/')[-1]))
+            local_filename = os.path.join(out_path, "{}__{}".format(datestr(date),
+                                                                    self.sanitize(filetitle or url.split('/')[-1])))
             final_filename = local_filename
             is_mp3 = local_filename.endswith("mp3")
 
@@ -255,7 +262,7 @@ class DelojzaBot:
             self.last_hashtags = update.message.from_user, update.message.chat, datetime.now(), hashtags
 
     # noinspection PyBroadException
-    def handle(self, urls, message, hashtags, download_fn, filename=None):
+    def handle(self, urls, message, hashtags, download_fn, filetitle=None):
         try:
             if len(hashtags) == 0:
                 self.logger.info("Ignoring %s due to no hashtag present..." % urls)
@@ -281,7 +288,7 @@ class DelojzaBot:
                 reply += ' (And also guessing you want to extract the audio)'
             message.reply_text(reply)
 
-            filenames = download_fn(urls, out_path, message.date, message, audio=audio, filename=filename)
+            filenames = download_fn(urls, out_path, message.date, message, audio=audio, filetitle=filetitle)
 
             cmd_hashtag = hashtags[0]
 
@@ -290,8 +297,8 @@ class DelojzaBot:
                 now = cmd_hashtag == 'TUMBLR_NOW'
                 reply = '(btw, {})'.format("***FIRING TO TUMBLR RIGHT AWAY***" if now else "queueing to tumblr")
                 message.reply_text(reply, parse_mode=telegram.ParseMode.MARKDOWN)
-                for filename in filenames:
-                    response = self.tumblr_client.create_photo(self.tumblr_name, data=filename,
+                for filetitle in filenames:
+                    response = self.tumblr_client.create_photo(self.tumblr_name, data=filetitle,
                                                                state="published" if now else "queue")
                     if 'id' in response:
                         tumblr_ids.append(response['id'])
@@ -327,15 +334,15 @@ class DelojzaBot:
         self.handle_tg_message(update.message, bot, self.get_hashtags(update.message))
 
     def handle_tg_message(self, message, bot, hashtag):
-        file, filename, tumblr = None, None, False
+        file, filetitle, tumblr = None, None, False
         if len(message.photo) > 0:
             photo = max(message.photo, key=lambda p: p.width)
             file = photo.file_id
         elif message.document is not None:
-            filename = message.document.file_name
+            filetitle = message.document.file_name
             file = message.document.file_id
         elif message.audio is not None:
-            filename = message.audio.title
+            filetitle = message.audio.title
             file = message.audio.file_id
         elif message.video is not None:
             file = message.video.file_id
@@ -346,7 +353,7 @@ class DelojzaBot:
 
         if file is not None:
             url = bot.getFile(file).file_path
-            self.handle([url], message, hashtag, self.download_raw, filename=filename)
+            self.handle([url], message, hashtag, self.download_raw, filetitle=filetitle)
 
     def tg_handle_text(self, _, update):
         if self.markov: