discard_media.py 1.5 KB

1234567891011121314151617181920212223242526272829303132333435363738
  1. import os
  2. import sys
  3. def find_small_media_files(root_dir, max_size_mb=50, output_file="small_media_files.txt"):
  4. import re
  5. max_size_bytes = max_size_mb * 1024 * 1024
  6. media_exts = ('.mkv', '.mp4', '.mht', '.txt', '.url', '.gif', '.mp4', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.webm')
  7. found = []
  8. for dirpath, _, filenames in os.walk(root_dir):
  9. for fname in filenames:
  10. fpath = os.path.join(dirpath, fname)
  11. # Check for extension, size, or pattern match
  12. matches_ext_and_size = (
  13. fname.lower().endswith(media_exts) and
  14. os.path.getsize(fpath) < max_size_bytes
  15. )
  16. matches_pattern = (
  17. fname.endswith("__") or
  18. "padding" in fname.lower()
  19. )
  20. try:
  21. if matches_ext_and_size or matches_pattern:
  22. found.append(fpath)
  23. except Exception:
  24. continue
  25. with open(output_file, "w", encoding="utf-8") as f:
  26. for path in found:
  27. f.write(path + "\n")
  28. print(f"Found {len(found)} files matching criteria. Results saved to {output_file}")
  29. if __name__ == "__main__":
  30. if len(sys.argv) < 2:
  31. print("Usage: python find_small_media.py <directory> [max_size_mb] [output_file]")
  32. else:
  33. root = sys.argv[1]
  34. max_mb = int(sys.argv[2]) if len(sys.argv) > 2 else 1
  35. out_file = sys.argv[3] if len(sys.argv) > 3 else "small_media_files.txt"
  36. find_small_media_files(root, max_mb, out_file)