@@ -0,0 +1,135 @@
+# -*- coding: utf-8 -*-
+import re, time
+#import HTMLParser
+from html.parser import HTMLParser
+from calibre import strftime
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+class OrientalDailyPure(BasicNewsRecipe):
+
+    title = 'Oriental Daily - ' + time.strftime('%d %b %Y')
+    __author__ = 'Larry Chan'
+    description = 'Oriental Daily, Hong Kong'
+    publication_type = 'newspaper'
+    language = 'zh'
+    timefmt = ' [%a, %d %b, %Y]'
+    masthead_url = 'http://orientaldaily.on.cc/img/v2/logo_odn.png'
+    #cover_url = 'http://orientaldaily.on.cc/cnt/news/' + time.strftime('%Y%m%d') + '/photo/' + time.strftime('%m%d') + '-00174-001k1.jpg'
+    cover_url = 'https://orientaldaily.on.cc/asset/main/%s/photo/337_sectMain.jpg' % time.strftime('%Y%m%d')
+    #print("cover %s" % cover_url)
+    delay = 0
+
+    no_stylesheets = True
+    extra_css = 'h1 {font: large sans-serif;}\n.byline {font-family: monospace;}'
+
+    # keep_only_tags = [
+    #     dict(name='h1'),
+    #     dict(name='a'),
+    #     dict(name='img'),
+    #     dict(name='div'),
+    #     dict(attrs={'div': 'content'})
+    # ]
+
+    # dict(name='p', attrs={'class': ['photoCaption', 'paragraph']})
+    # remove_tags = [dict(name=['script', 'input'])]
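+    # Intended to relax html.parser's attribute-matching regex so that
+    # unquoted attribute values also stop at [, ], {, }, | and quotes,
+    # not only at whitespace and '>'.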
+    HTMLParser.attrfind = re.compile(
+        r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
+        r'(\'[^\']*\'|"[^"]*"|[^\s>^\[\]{}\|\'\"]*))?')
+
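+    # parse_index() builds calibre's section list: read the front-page menu,
+    # keep the wanted sections and scrape each one into a list of articles.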
+    def parse_index(self):
+
+        def extract_text(tag):
+            return str(tag.contents[0]).replace('<em>', '').replace('</em>', '')
+
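+        # scrap_feed() downloads one section page and turns each listed item
+        # into the article dict calibre expects (url/title/date/description).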
+        def scrap_feed(feed):
+            f_url = '%s%s' % (urlRoot, feed[0])
+            print('feed url %s' % f_url)
+            soup = self.index_to_soup(f_url)
+            # Verify the section is available for download on the day this
+            # script is run and skip it if not; for instance, the finance and
+            # lifestyle sections are unavailable on Sunday.
+            try:
+                articles = soup.findAll('div', 'sectionList')[0].findAll('li')
+            except Exception:
+                print('--- this section [%s] is not available today ---' % feed[1])
+                raise Exception('--- this section [%s] is not available today ---' % feed[1])
+
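+            # Each <li> carries the article link plus a div with class 'text'
+            # holding the headline, which doubles as the description.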
+            articles = map(lambda x: {'url': '%s%s' % (urlRoot, x.a['href']),
+                                      'title': x.findAll('div', attrs={'class': 'text'})[0].text,
+                                      'date': strftime('%a, %d %b'),
+                                      'description': x.findAll('div', attrs={'class': 'text'})[0].text,
+                                      'content': ''}, articles)
+            ans = []
+            for article in articles:
+                ans.append(article)
+            return ans
+
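+        # Read the front page and pick the wanted sections out of the menu;
+        # each entry becomes an (href, section title) pair handed to scrap_feed().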
+        urlRoot = 'https://orientaldaily.on.cc'
+        url = urlRoot
+        soup = self.index_to_soup(url)
+        #lookups = ['news', 'china_world', 'finance', 'lifestyle', 'sport']
+        lookups = ['news', 'china_world', 'finance', 'entertainment', 'lifestyle', 'adult', 'sport']
+        # no financial news on Sunday
+        #if time.strftime('%w') == '0':
+        #    lookups.remove('finance')
+
+        feeds = soup.findAll('ul', 'menuList clear')[0].findAll('li', attrs={'section': lookups})
+        feeds = map(lambda x: (x.a['href'], x.text), feeds)
+        feeds = list(feeds)
+
+        print('----------------------- The feeds are: %s' % feeds)
+        ans = []
+        for e in feeds:
+            try:
+                print('e[1] is: %s | %s\n' % (e[1], e[0]))
+                ans.append((e[1], scrap_feed(e)))
+            except Exception as err:
+                print('while processing feed: %s' % err)
+                continue
+
+        print('############')
+        print(ans)
+        return ans
+
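+    # preprocess_html() keeps only the headline and the div.content blocks of
+    # an article page, and appends the linked photo from the photoParagraph
+    # block when one is present.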
+    def preprocess_html(self, soup):
+
+        print('((((( begin article ))))')
+        try:
+            #print(soup)
+            html = str(soup.find('h1')) + ''.join(str(t) for t in soup.findAll('div', 'content'))
+            # download photo
+            pic = soup.find('div', 'paragraph photoParagraph')
+            #print(pic)
+            if pic is not None:
+                html += '<a href="%s"><img src="%s"/></a>' % (str(pic.a['href']), str(pic.img['src']))
+            #print('>>>>>>>>>>>>>>> %s' % html)
+            return BeautifulSoup(html)
+        except Exception as e:
+            print(e)
+            print('other article...')
+        print('((((( end article ))))')
+        return soup
+
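+    # Fetch pages with a desktop Chrome user-agent string.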
+    def get_browser(self, *args, **kwargs):
+        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
+        br.set_header('User-Agent', value='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36')
+        return br