oriental_daily_pure.recipe

# -*- coding: utf-8 -*-
import re
import time
# import HTMLParser  # Python 2 import, replaced by html.parser below
from html.parser import HTMLParser
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class OrientalDailyPure(BasicNewsRecipe):
    title = 'Oriental Daily - ' + time.strftime('%d %b %Y')
    __author__ = 'Larry Chan'
    description = 'Oriental Daily, Hong Kong'
    publication_type = 'newspaper'
    language = 'zh'
    timefmt = ' [%a, %d %b, %Y]'
    masthead_url = 'http://orientaldaily.on.cc/img/v2/logo_odn.png'
    # cover_url = 'http://orientaldaily.on.cc/cnt/news/' + time.strftime('%Y%m%d') + '/photo/' + time.strftime('%m%d') + '-00174-001k1.jpg'
    cover_url = 'https://orientaldaily.on.cc/asset/main/%s/photo/337_sectMain.jpg' % time.strftime('%Y%m%d')
    # print('cover %s' % cover_url)
    delay = 0
    no_stylesheets = True
    extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
    # keep_only_tags = [
    #     dict(name='h1'),
    #     dict(name='a'),
    #     dict(name='img'),
    #     dict(name='div'),
    #     dict(attrs={'div': 'content'})
    # ]
    # dict(name='p', attrs={'class': ['photoCaption', 'paragraph']})
    # remove_tags = [dict(name=['script', 'input'])]
    # Relax HTMLParser's attribute-matching regex so that unusual attribute
    # values in the site's markup do not abort parsing.
    HTMLParser.attrfind = re.compile(
        r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
        r'(\'[^\']*\'|"[^"]*"|[^\s>^\[\]{}\|\'\"]*))?')
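
    # Note on the calibre contract assumed below: parse_index() must return a
    # list of (section title, list of article descriptions) tuples, where each
    # article description is a dict with at least 'title' and 'url' keys.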
    def parse_index(self):

        def extract_text(tag):
            # Strip <em> markup from a tag's first child (currently unused).
            return str(tag.contents[0]).replace('<em>', '').replace('</em>', '')

        def scrap_feed(feed):
            f_url = '%s%s' % (urlRoot, feed[0])
            print('feed url %s' % f_url)
            soup = self.index_to_soup(f_url)
            # Verify that the section is available for download on the day this
            # script is run and skip it if not; for instance, the finance and
            # lifestyle sections are unavailable on Sunday.
            try:
                articles = soup.findAll('div', 'sectionList')[0].findAll('li')
            except Exception:
                print('--- this section [%s] is not available today ---' % feed[1])
                raise Exception('--- this section [%s] is not available today ---' % feed[1])
            return [{
                'url': '%s%s' % (urlRoot, x.a['href']),
                'title': x.findAll('div', attrs={'class': 'text'})[0].text,
                'date': strftime('%a, %d %b'),
                'description': x.findAll('div', attrs={'class': 'text'})[0].text,
                'content': ''
            } for x in articles]
        urlRoot = 'https://orientaldaily.on.cc'
        print('############ before soup')
        soup = self.index_to_soup(urlRoot)
        print('############ after soup')
        # lookups = ['news', 'china_world', 'finance', 'lifestyle', 'sport']
        lookups = ['news', 'china_world', 'finance', 'entertainment', 'lifestyle', 'adult', 'sport']
        # There is no financial news on Sunday:
        # if time.strftime('%w') == '0':
        #     lookups.remove('finance')
        feeds = soup.findAll('ul', 'menuList clear')[0].findAll('li', attrs={'section': lookups})
        feeds = [(x.a['href'], x.text) for x in feeds]
        print('----------------------- The feeds are: %s' % feeds)
        ans = []
        for feed in feeds:
            try:
                print('feed is: %s | %s\n' % (feed[1], feed[0]))
                ans.append((feed[1], scrap_feed(feed)))
            except Exception as err:
                print('while processing feed: %s' % err)
                continue
        print('############')
        print(ans)
        return ans
    def preprocess_html(self, soup):
        print('((((( begin article ))))')
        try:
            # Keep only the headline and the article body blocks.
            html = str(soup.find('h1')) + ''.join(str(t) for t in soup.findAll('div', 'content'))
            # Append the article photo, if one is present.
            pic = soup.find('div', 'paragraph photoParagraph')
            if pic is not None:
                html += '<a href="%s"><img src="%s"/></a>' % (str(pic.a['href']), str(pic.img['src']))
            return BeautifulSoup(html)
        except Exception as e:
            print(e)
            print('other article...')
        print('((((( end article ))))')
        return soup
    def get_browser(self, *args, **kwargs):
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        # br.set_header('User-Agent', value='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36')
        br.set_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
        br.set_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
        br.set_header('Accept-Language', 'en-US,en;q=0.5')
        return br
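
# A typical way to try the recipe locally is calibre's ebook-convert tool
# (the output filename here is only an example):
#
#   ebook-convert oriental_daily_pure.recipe output.epub --test -vv
#
# --test limits the download to a couple of articles per feed so that
# parse_index() and preprocess_html() can be checked quickly.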