# ghibili4.py (9.1 KB)
  1. # -*- coding: utf-8 -*-
  2. import os
  3. import json
  4. import requests
  5. def collect_chinese_titles_and_paths(root_dir):
  6. """Traverse the directory tree and collect all unique Chinese movie titles (directory names) and their paths,
  7. excluding directories containing certain keywords. Also collect .mkv and .mp4 files (even with unwanted trailing chars),
  8. but ignore files containing '国语' in their filename."""
  9. exclude_keywords = ['删除文件', '默认日语', '切换语言']
  10. titles = []
  11. paths = {}
  12. media_files = {}
  13. for dirpath, dirnames, filenames in os.walk(root_dir):
  14. for dirname in dirnames:
  15. if any(keyword in dirname for keyword in exclude_keywords):
  16. continue
  17. full_path = os.path.join(dirpath, dirname)
  18. titles.append(dirname)
  19. paths[dirname] = full_path
  20. # Collect .mkv and .mp4 files, allowing for unwanted trailing chars (e.g., .mkv1, .mp41)
  21. try:
  22. files_in_dir = os.listdir(full_path)
  23. media_files[dirname] = [
  24. os.path.join(full_path, f)
  25. for f in files_in_dir
  26. if (
  27. (f.lower().endswith(('.mkv', '.mp4')) or
  28. f.lower().endswith(('.mkv1', '.mp41')) or
  29. any(f.lower().endswith(ext + str(i)) for ext in ['.mkv', '.mp4'] for i in range(1, 10)))
  30. and '国语' not in f
  31. and '720P' not in f
  32. )
  33. ]
  34. except Exception as e:
  35. media_files[dirname] = []
  36. return titles, paths, media_files
  37. def match_english_title(chinese_title):
  38. """
  39. Match the Chinese title to an English title.
  40. This can use a hard-coded mapping or call DeepSeek API for translation.
  41. """
  42. hardcoded = {
  43. "2013-辉夜姬物语": "The Tale of the Princess Kaguya",
  44. "2004-哈尔移动城堡": "Howl's Moving Castle",
  45. "1995-侧耳倾听": "Whisper of the Heart",
  46. "1992-飞天红猪侠": "Porco Rosso",
  47. "1988-萤火虫之墓": "Grave of the Fireflies",
  48. "1989-魔女宅急便": "Kiki's Delivery Service",
  49. "1994-百变狸猫": "Pom Poko",
  50. "1984-风之谷": "Nausicaä of the Valley of the Wind",
  51. "1979-鲁邦三世 卡里奥斯特罗城": "Lupin III: The Castle of Cagliostro",
  52. "2011-虞美人盛开的山坡": "From Up on Poppy Hill",
  53. "2008-《悬崖上的金鱼姬》《崖上的波妞》": "Ponyo",
  54. "2006-地海战记": "Tales from Earthsea",
  55. "2010-借东西的小矮人亚莉亚蒂": "The Secret World of Arrietty",
  56. "2002-猫的报恩": "The Cat Returns",
  57. "1999-我的邻居山田君": "My Neighbors the Yamadas",
  58. "1988-龙猫": "My Neighbor Totoro",
  59. "2001-千Yu千寻": "Spirited Away",
  60. "1993-听到涛声": "Ocean Waves",
  61. "2014-记忆中的玛妮": "When Marnie Was There",
  62. "1991-岁月的童话": "Only Yesterday",
  63. "1997-幽灵公主": "Princess Mononoke",
  64. "2016-红海龟": "The Red Turtle",
  65. "2013-起风了": "The Wind Rises"
  66. }
  67. if chinese_title in hardcoded:
  68. return hardcoded[chinese_title]
  69. # Optionally add DeepSeek API translation here
  70. return None # No match found
  71. def search_imdb_id(english_title):
  72. """Search IMDb for the movie and return the IMDb ID."""
  73. if not english_title:
  74. return None
  75. params = {"q": english_title, "s": "tt", "ttype": "ft", "ref_": "fn_ft"}
  76. headers = {
  77. "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
  78. }
  79. try:
  80. resp = requests.get("https://www.imdb.com/find", params=params, headers=headers, timeout=10)
  81. resp.raise_for_status()
  82. import re
  83. matches = re.findall(r'/title/(tt\d+)/', resp.text)
  84. if matches:
  85. return matches[0]
  86. except Exception as e:
  87. print(f"IMDb search error for {english_title}: {e}")
  88. return None
  89. def search_tmdb_id(english_title, api_key):
  90. """Search TMDB for the movie and return the TMDB ID and IMDb ID if available."""
  91. if not english_title or not api_key:
  92. return None, None
  93. url = "https://api.themoviedb.org/3/search/movie"
  94. params = {
  95. "api_key": api_key,
  96. "query": english_title,
  97. "language": "en-US"
  98. }
  99. try:
  100. resp = requests.get(url, params=params, timeout=10)
  101. resp.raise_for_status()
  102. data = resp.json()
  103. if data.get("results"):
  104. movie = data["results"][0]
  105. tmdb_id = movie.get("id")
  106. imdb_id = None
  107. # Try to get IMDb ID from TMDB details
  108. details_url = f"https://api.themoviedb.org/3/movie/{tmdb_id}"
  109. details_params = {"api_key": api_key}
  110. details_resp = requests.get(details_url, params=details_params, timeout=10)
  111. if details_resp.ok:
  112. imdb_id = details_resp.json().get("imdb_id")
  113. return tmdb_id, imdb_id
  114. except Exception as e:
  115. print(f"TMDB search error for {english_title}: {e}")
  116. return None, None
  117. def main(root_dir, use_tmdb=False, tmdb_api_key=None):
  118. chinese_titles, dir_paths, media_files = collect_chinese_titles_and_paths(root_dir)
  119. results = []
  120. unmatched = []
  121. print(f"Found {len(chinese_titles)} unique Chinese titles.")
  122. for chinese_title in chinese_titles:
  123. english_title = match_english_title(chinese_title)
  124. imdb_id = None
  125. tmdb_id = None
  126. dir_path = dir_paths.get(chinese_title, "")
  127. media_list = media_files.get(chinese_title, [])
  128. # If using TMDB, search by the second token (Chinese title) for better accuracy
  129. if use_tmdb and tmdb_api_key:
  130. try:
  131. search_str = chinese_title.split("-", 1)[1]
  132. except IndexError:
  133. search_str = chinese_title
  134. print(f"Using TMDB API for ID search for title {search_str}")
  135. tmdb_id, imdb_id, fetched_english_title = None, None, None
  136. url = "https://api.themoviedb.org/3/search/movie"
  137. params = {
  138. "api_key": tmdb_api_key,
  139. "query": search_str,
  140. "language": "zh-CN"
  141. }
  142. try:
  143. resp = requests.get(url, params=params, timeout=10)
  144. resp.raise_for_status()
  145. data = resp.json()
  146. if data.get("results"):
  147. movie = data["results"][0]
  148. tmdb_id = movie.get("id")
  149. # Try to get English title from TMDB details
  150. details_url = f"https://api.themoviedb.org/3/movie/{tmdb_id}"
  151. details_params = {"api_key": tmdb_api_key, "language": "en-US"}
  152. details_resp = requests.get(details_url, params=details_params, timeout=10)
  153. if details_resp.ok:
  154. details = details_resp.json()
  155. fetched_english_title = details.get("title")
  156. imdb_id = details.get("imdb_id")
  157. except Exception as e:
  158. print(f"TMDB search error for {search_str}: {e}")
  159. else:
  160. imdb_id = search_imdb_id(english_title)
  161. fetched_english_title = english_title
  162. # Prefer fetched English title if available
  163. final_english_title = fetched_english_title if fetched_english_title else english_title
  164. if final_english_title and (imdb_id or tmdb_id):
  165. entry = {
  166. "chinese": chinese_title,
  167. "english": final_english_title,
  168. "path": dir_path,
  169. "media_files": media_list
  170. }
  171. if imdb_id:
  172. entry["imdb_id"] = imdb_id
  173. if tmdb_id:
  174. entry["tmdb_id"] = tmdb_id
  175. results.append(entry)
  176. else:
  177. unmatched.append({
  178. "chinese": chinese_title,
  179. "english": final_english_title,
  180. "imdb_id": imdb_id if imdb_id else None,
  181. "tmdb_id": tmdb_id if tmdb_id else None,
  182. "path": dir_path,
  183. "media_files": media_list,
  184. "reason": "No English match" if not final_english_title else "No ID match"
  185. })
  186. print(f"Matched: {len(results)}")
  187. print(f"Unmatched: {len(unmatched)}")
  188. with open("ghibli_imdb_results.json", "w", encoding="utf-8") as f:
  189. json.dump({"matched": results, "unmatched": unmatched}, f, ensure_ascii=False, indent=2)
  190. if __name__ == "__main__":
  191. import sys
  192. # Usage: python ghibili.py <directory> [--tmdb TMDB_API_KEY]
  193. # Example: python3 ghibili4.py /media/yazoo/luks-67672a15-a412-4a17-bb01-c76509e21243/crm/crm-media/anime/赠品:宫崎骏+新海城动画 --tmdb 36dfc5c362f731b9b777e1f30028fbb0
  194. use_tmdb = False
  195. tmdb_api_key = '36dfc5c362f731b9b777e1f30028fbb0'
  196. if len(sys.argv) < 2:
  197. print("Usage: python ghiblil.py <directory> [--tmdb TMDB_API_KEY]")
  198. else:
  199. root_dir = sys.argv[1]
  200. if len(sys.argv) >= 4 and sys.argv[2] == "--tmdb":
  201. use_tmdb = True
  202. tmdb_api_key = sys.argv[3]
  203. main(root_dir, use_tmdb, tmdb_api_key)