Browse source

enhance oriental daily

larry, 4 years ago
parent
commit
f26b3b6ed9
1 changed file with 156 additions and 0 deletions
  1. 156 0
      calibre/recipe/oriental_daily_pure.recipe

+ 156 - 0
calibre/recipe/oriental_daily_pure.recipe

@@ -0,0 +1,156 @@
# -*- coding: utf-8 -*-
import re
import string
import time

#import HTMLParser
from html.parser import HTMLParser

from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
class OrientalDailyPure(BasicNewsRecipe):
    """Calibre recipe for Oriental Daily (東方日報), Hong Kong.

    Builds today's edition by scraping the section menu on the site's
    front page, then pulling each section's article list. Article pages
    are stripped down to the headline, body text and lead photo in
    :meth:`preprocess_html`.
    """

    title       = 'Oriental Daily - ' + time.strftime('%d %b %Y')
    __author__  = 'Larry Chan'
    description = 'Oriental Daily, Hong Kong'
    publication_type = 'newspaper'
    language    = 'zh'
    timefmt = ' [%a, %d %b, %Y]'
    masthead_url = 'http://orientaldaily.on.cc/img/v2/logo_odn.png'
    # Today's main-section photo doubles as the cover image.
    cover_url = 'https://orientaldaily.on.cc/asset/main/%s/photo/337_sectMain.jpg' % time.strftime('%Y%m%d')
    delay = 0

    no_stylesheets = True
    extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'

    # The site emits attribute values that trip up html.parser's default
    # attribute regex; loosen it so tag attributes still parse.
    # NOTE(review): patching the class attribute has no effect on the
    # module-level helpers html.parser actually uses in Python 3 — kept
    # as in the original; confirm it is still needed.
    HTMLParser.attrfind = re.compile(
                        r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
                        r'(\'[^\']*\'|"[^"]*"|[^\s>^\[\]{}\|\'\"]*))?')

    def parse_index(self):
        """Return the index as a list of (section_title, article_dicts) pairs.

        Scrapes the front-page section menu for the wanted section slugs,
        then fetches each section page for its article list.
        """

        def scrap_feed(feed):
            # feed is a (href, section_title) pair; fetch the section page
            # and turn each <li> into a calibre article dict.
            f_url = '%s%s' % (urlRoot, feed[0])
            soup = self.index_to_soup(f_url)
            articles = []
            for li in soup.findAll('div', 'sectionList')[0].findAll('li'):
                # The same text node serves as both title and description.
                text = li.findAll('div', attrs={'class': 'text'})[0].text
                articles.append({
                    'url': '%s%s' % (urlRoot, li.a['href']),
                    'title': text,
                    'date': strftime('%a, %d %b'),
                    'description': text,
                    'content': '',
                })
            return articles

        urlRoot = 'https://orientaldaily.on.cc'
        soup = self.index_to_soup(urlRoot)
        # Section slugs to include in the edition.
        lookups = ['news', 'china_world', 'finance', 'lifestyle', 'sport']
        # No financial news on Sunday.
        if time.strftime('%w') == '0':
            lookups.remove('finance')

        menu = soup.findAll('ul', 'menuList clear')[0]
        feeds = [(li.a['href'], li.text)
                 for li in menu.findAll('li', attrs={'section': lookups})]
        self.log('Feeds found: %s' % feeds)

        ans = []
        for href, section_title in feeds:
            ans.append((section_title, scrap_feed((href, section_title))))
        return ans

    def preprocess_html(self, soup):
        """Reduce an article page to headline + body divs + lead photo.

        Falls back to the unmodified page when the expected structure is
        missing (e.g. non-standard article layouts) rather than failing
        the whole download.
        """
        try:
            html = str(soup.find('h1')) + ''.join(
                str(t) for t in soup.findAll('div', 'content'))
            # Append the lead photo (linked to its full-size version).
            pic = soup.find('div', 'paragraph photoParagraph')
            if pic is not None:
                html += '<a href="%s"><img src="%s"></img></a>' % (
                    str(pic.a['href']), str(pic.img['src']))
            return BeautifulSoup(html)
        except Exception as e:
            # Best effort: log and keep the original page.
            self.log('preprocess_html fallback: %s' % e)
        return soup

    def get_browser(self, *args, **kwargs):
        # Forward *args/**kwargs to the base class (the original dropped
        # them) and present a desktop Chrome UA so the desktop site is
        # served instead of the mobile one.
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        br.set_header('User-Agent', value='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36')
        return br