# -*- coding: utf-8 -*-
import re
import string
import time

#import HTMLParser
from html.parser import HTMLParser

from calibre import strftime
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.recipes import BasicNewsRecipe
class OrientalDailyPure(BasicNewsRecipe):
    """Calibre recipe for the Hong Kong newspaper Oriental Daily (orientaldaily.on.cc)."""

    title = 'Oriental Daily - ' + time.strftime('%d %b %Y')
    __author__ = 'Larry Chan'
    description = 'Oriental Daily, Hong Kong'
    publication_type = 'newspaper'
    language = 'zh'
    timefmt = ' [%a, %d %b, %Y]'
    masthead_url = 'http://orientaldaily.on.cc/img/v2/logo_odn.png'
    # Cover image of today's edition; the asset path is keyed by YYYYMMDD.
    cover_url = 'https://orientaldaily.on.cc/asset/main/%s/photo/337_sectMain.jpg' % time.strftime('%Y%m%d')
    delay = 0
    no_stylesheets = True
    extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'

    # Relax the HTML attribute-parsing regex so odd characters in the site's
    # attribute values do not abort parsing.
    # NOTE(review): on Python 3, html.parser's HTMLParser matches attributes
    # with module-level tolerant patterns, so assigning HTMLParser.attrfind on
    # the class may have no effect -- confirm this override is still needed.
    HTMLParser.attrfind = re.compile(
        r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
        r'(\'[^\']*\'|"[^"]*"|[^\s>^\[\]{}\|\'\"]*))?')
def parse_index(self):
def extract_text(tag):
return str(tag.contents[0]).replace('', '').replace('', '')
def old_scrap_feed(feed):
f_url = '%s%s' % (urlRoot, feed[0])
print (f_url)
#f_html = urlopen(f_url).read()
sf = self.index_to_soup(f_url)
# extract all h2 headlines
l_h2 = map(lambda x: extract_text(x), sf.findAll('h2'))
l_h2 = list(l_h2)[:len(list(l_h2))-2]
# for each headline, look for the feed title and feed url
# print '--------------------'
print ('LEVEL H2: %s' % l_h2)
# for hl in l_h2:
# print 'h1: ' + hl
# print sf.findAll('ul', {'title': hl })[0].findAll('li')
l_feed = map(lambda x: sf.findAll('ul', {'title': x })[0].findAll('li'), l_h2)
print ('-----------l_feed')
l_feed = [item for sublist in l_feed for item in sublist]
print (l_feed[0])
print ('------END -----l_feed')
# l_feed = map(lambda x: {'url': '%s%s' % (urlRoot, x[0].a['href']), 'title': extract_text(x[0].a),
# 'date': strftime('%a, %d %b'),
# 'description': '%s' % (extract_text(x[0].a)),
# 'content': ''}, l_feed)
l_feed = map(lambda x: {'url': '%s%s' % (urlRoot, x.a['href']), 'title': extract_text(x.a),
'date': strftime('%a, %d %b'),
'description': '%s' % (extract_text(x.a)),
'content': ''}, l_feed)
print ('****************************')
l_feed = list(l_feed)
print (list(l_feed))
return l_feed
def scrap_feed(feed):
f_url = '%s%s' % (urlRoot, feed[0])
print (f_url)
soup = self.index_to_soup(f_url)
articles = soup.findAll('div', 'sectionList')[0].findAll('li')
articles = map(lambda x:{'url': '%s%s' % (urlRoot, x.a['href']),
'title': x.findAll('div', attrs={'class' : 'text'})[0].text,
'date': strftime('%a, %d %b'),
'description': x.findAll('div', attrs={'class' : 'text'})[0].text,
'content': ''}, articles)
ans = []
for article in articles:
ans.append(article)
return ans
urlRoot = 'https://orientaldaily.on.cc'
#url = '%s/cnt/news/%s/index.html' % (urlRoot, time.strftime('%Y%m%d'))
url = urlRoot
#url = '%s/cnt/news/%s/index.html' % (urlRoot, '20201127')
soup = self.index_to_soup(url)
#lookups = ['要聞港聞','兩岸國際','財經','娛樂','副刊','男極圈','體育','馬經','波經','社論專欄','慈善基金','昔日東方']
lookups = ['news', 'china_world', 'finance', 'lifestyle', 'sport']
# no finanical news on Sunday
if time.strftime('%w') == '0':
lookups.remove('finance')
feeds = soup.findAll('ul', 'menuList clear')[0].findAll('li', attrs={'section':lookups})
feeds = map(lambda x: (x.a['href'], x.text), feeds)
feeds = list(feeds)
print ('----------------------- The feeds are: %s' % feeds)
ans = []
for e in feeds:
print ('e[1] is: %s | %s\n' % (e[1], e[0]))
ans.append((e[1], scrap_feed(e)))
print ('############')
print (ans)
return ans
def preprocess_html(self, soup):
print('((((( begin article ))))')
try:
#print(soup)
html = str(soup.find('h1')) + ''.join(str(t) for t in soup.findAll('div', 'content'))
# download photo
pic = soup.find('div', 'paragraph photoParagraph')
#print (pic)
if pic != None:
html += '
' % (str(pic.a['href']), str(pic.img['src']))
print('>>>>>>>>>>>>>>> %s' % html)
return BeautifulSoup(html)
except Exception as e:
print (e)
print('other article...')
print('((((( end article ))))')
return soup
def get_browser(self, *args, **kwargs):
br = BasicNewsRecipe.get_browser(self)
br.set_header('User-Agent', value='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36')
return br