[utils] Sanitize look-alike Unicode glyphs in non-ID filename fields when --restrict-filenames is set

Implements https://github.com/ytdl-org/youtube-dl/issues/31216#issuecomment-1236102822, which has a test.
pull/31304/head
dirkf 2022-10-11 12:18:12 +00:00 committed by GitHub
parent 6e2626f092
commit c94a459a24
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 4 additions and 0 deletions

View File

@@ -33,6 +33,7 @@ import sys
import tempfile import tempfile
import time import time
import traceback import traceback
import unicodedata
import xml.etree.ElementTree import xml.etree.ElementTree
import zlib import zlib
@@ -2118,6 +2119,9 @@ def sanitize_filename(s, restricted=False, is_id=False):
return '_' return '_'
return char return char
# Replace look-alike Unicode glyphs
if restricted and not is_id:
s = unicodedata.normalize('NFKC', s)
# Handle timestamps # Handle timestamps
s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
result = ''.join(map(replace_insane, s)) result = ''.join(map(replace_insane, s))