handle missing sections in recipe

larry, 4 years ago
commit 546a15ac3e
2 files changed, 25 insertions and 51 deletions
  1. calibre/recipe/oriental_daily_pure.recipe: +25, -46
  2. calibre/recipe/upkindle.sh: +0, -5

calibre/recipe/oriental_daily_pure.recipe: +25, -46

@@ -49,47 +49,23 @@ class OrientalDailyPure(BasicNewsRecipe):
             return str(tag.contents[0]).replace('<em>', '').replace('</em>', '')
  
 
-
-        def old_scrap_feed(feed):
-            f_url = '%s%s' % (urlRoot, feed[0])
-            print (f_url)
-            #f_html = urlopen(f_url).read()
-            sf = self.index_to_soup(f_url)
-            # extract all h2 headlines
-            l_h2 = map(lambda x: extract_text(x), sf.findAll('h2'))
-            l_h2 = list(l_h2)[:len(list(l_h2))-2]
-            
-            # for each headline, look for the feed title and feed url
-             
-#	    print '--------------------'
-            print ('LEVEL H2: %s' % l_h2)
-#            for hl in l_h2:
-#                print 'h1: ' + hl
-#                print sf.findAll('ul', {'title': hl })[0].findAll('li')
-            l_feed = map(lambda x: sf.findAll('ul', {'title': x })[0].findAll('li'), l_h2)
-            print ('-----------l_feed')
-            l_feed = [item  for sublist in l_feed for item in sublist]
-            print (l_feed[0])
-            print ('------END -----l_feed')
-#            l_feed = map(lambda x: {'url': '%s%s' % (urlRoot, x[0].a['href']), 'title': extract_text(x[0].a),
-#                                    'date': strftime('%a, %d %b'), 
-#                                    'description': '%s' % (extract_text(x[0].a)),
-#                                    'content': ''}, l_feed)
-            l_feed = map(lambda x: {'url': '%s%s' % (urlRoot, x.a['href']), 'title': extract_text(x.a),
-                                    'date': strftime('%a, %d %b'), 
-                                    'description': '%s' % (extract_text(x.a)),
-                                    'content': ''}, l_feed)
-            print ('****************************')
-            
-            l_feed = list(l_feed)
-            print (list(l_feed))
-            return l_feed
         
         def scrap_feed(feed):
             f_url = '%s%s' % (urlRoot, feed[0])
-            print (f_url)
+            print ('feed url: %s' % f_url)
             soup = self.index_to_soup(f_url)
-            articles = soup.findAll('div', 'sectionList')[0].findAll('li')
+            # Verify the section is available for download on the day this script runs,
+            # and skip it if not. For instance, the finance section is unavailable on
+            # Sunday, and so is "lifestyle".
+            try:
+                articles = soup.findAll('div', 'sectionList')[0].findAll('li')
+            except IndexError:
+                # no 'sectionList' block on today's page: signal parse_index to skip it
+                print ('--- this section [%s] is not available today ---' % feed[1])
+                raise Exception('--- this section [%s] is not available today ---' % feed[1])
+
             articles = map(lambda x:{'url': '%s%s' % (urlRoot, x.a['href']), 
                             'title': x.findAll('div', attrs={'class' : 'text'})[0].text, 
                             'date': strftime('%a, %d %b'),
@@ -102,15 +78,13 @@ class OrientalDailyPure(BasicNewsRecipe):
                
 
         urlRoot = 'https://orientaldaily.on.cc'
-        #url = '%s/cnt/news/%s/index.html' % (urlRoot, time.strftime('%Y%m%d'))
         url = urlRoot 
-        #url = '%s/cnt/news/%s/index.html' % (urlRoot, '20201127')
         soup = self.index_to_soup(url)
-        #lookups = ['要聞港聞','兩岸國際','財經','娛樂','副刊','男極圈','體育','馬經','波經','社論專欄','慈善基金','昔日東方']
-        lookups = ['news', 'china_world', 'finance', 'lifestyle', 'sport']
+        #lookups = ['news', 'china_world', 'finance', 'lifestyle', 'sport']
+        lookups = ['news', 'china_world', 'finance', 'entertainment', 'lifestyle', 'adult', 'sport']
         # no financial news on Sunday
-        if time.strftime('%w') == '0':
-           lookups.remove('finance') 
+        #if time.strftime('%w') == '0':
+        #   lookups.remove('finance') 
 
         feeds = soup.findAll('ul', 'menuList clear')[0].findAll('li', attrs={'section':lookups})
         feeds = map(lambda x: (x.a['href'], x.text), feeds)
@@ -119,8 +93,13 @@ class OrientalDailyPure(BasicNewsRecipe):
         print ('----------------------- The feeds are: %s' % feeds)
         ans = []
         for e in feeds:
-            print ('e[1] is: %s | %s\n' % (e[1], e[0]))
-            ans.append((e[1], scrap_feed(e)))
+            try:
+                print ('feed: %s | %s\n' % (e[1], e[0]))
+                ans.append((e[1], scrap_feed(e)))
+            except Exception as err:
+                # the section raised above because it is unavailable today; skip it
+                print ('while processing feed %s: %s' % (e[1], err))
+                continue
+
         print ('############')
         print (ans)
         return ans
@@ -141,7 +120,7 @@ class OrientalDailyPure(BasicNewsRecipe):
             #print (pic)
             if pic != None:
                html += '<a href="%s"><img src="%s"></img></a>' % (str(pic.a['href']), str(pic.img['src'])) 
-            print('>>>>>>>>>>>>>>> %s' % html)
+            #print('>>>>>>>>>>>>>>> %s' % html)
             return BeautifulSoup(html) 
         except Exception as e:
             print (e)
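
Taken together, the change makes a missing section non-fatal: scrap_feed raises when the day's page has no sectionList block for that section, and the feed loop in parse_index catches the exception and moves on. Below is a minimal standalone sketch of the same pattern; the names fetch_section, build_feeds and the sample SECTIONS data are illustrative, not part of the recipe.

    # Minimal sketch of the skip-on-missing-section pattern used in the diff above.
    # fetch_section / build_feeds / SECTIONS are illustrative names, not recipe code.
    from datetime import date


    class SectionUnavailable(Exception):
        """Raised when a section has no article list on the current day."""


    def fetch_section(section_name, article_lists):
        # Mirrors scrap_feed: take the first article list if one exists,
        # otherwise signal the caller that this section should be skipped today.
        if not article_lists:
            raise SectionUnavailable(
                '--- this section [%s] is not available today ---' % section_name)
        return article_lists[0]


    def build_feeds(sections):
        # Mirrors the loop in parse_index: try every section, keep the ones
        # that work, and continue past the ones that raise.
        ans = []
        for name, article_lists in sections:
            try:
                ans.append((name, fetch_section(name, article_lists)))
            except SectionUnavailable as err:
                print('while processing feed %s: %s' % (name, err))
                continue
        return ans


    if __name__ == '__main__':
        # On a Sunday-like day, 'finance' has no articles and is skipped.
        SECTIONS = [
            ('news', [['article 1', 'article 2']]),
            ('finance', []),   # unavailable today
            ('sport', [['article 3']]),
        ]
        print(date.today(), build_feeds(SECTIONS))

Raising and catching, rather than returning an empty article list, keeps an unavailable section out of the returned feed list entirely, so the generated e-book never contains an empty section.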

calibre/recipe/upkindle.sh: +0, -5

@@ -5,11 +5,6 @@ RECIPEPATH=$ROOTPATH/recipe
 MOBIPATH=$ROOTPATH/daily_news/oriental
 OPTIONS="--output-profile kindle_pw"
 echo $RECIPEPATH
-#
-# download oriental daily
-#
-#ebook-convert "$RECIPEPATH/oriental_daily_pure.recipe" $MOBIPATH/$TODAY-orient.mobi $OPTIONS
-
 #
 #  download news and save output to epub
 #