# -*- coding: utf-8 -*-
import re
import time
import html.parser

from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

# Loosen the stdlib HTML parser's attribute-matching regex so malformed
# attribute values on the site do not abort parsing. Python 3's parser
# consults the module-level attrfind_tolerant pattern, so that is what
# gets replaced here.
html.parser.attrfind_tolerant = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s>^\[\]{}\|\'\"]*))?')


class OrientalDailyPure(BasicNewsRecipe):
    title = 'Oriental Daily - ' + time.strftime('%d %b %Y')
    __author__ = 'Larry Chan'
    description = 'Oriental Daily, Hong Kong'
    publication_type = 'newspaper'
    language = 'zh'
    timefmt = ' [%a, %d %b, %Y]'
    masthead_url = 'http://orientaldaily.on.cc/img/v2/logo_odn.png'
    # cover_url = 'http://orientaldaily.on.cc/cnt/news/' + time.strftime('%Y%m%d') + '/photo/' + time.strftime('%m%d') + '-00174-001k1.jpg'
    cover_url = 'https://orientaldaily.on.cc/asset/main/%s/photo/337_sectMain.jpg' % time.strftime('%Y%m%d')
    delay = 0
    no_stylesheets = True
    extra_css = 'h1 {font-family: sans-serif; font-size: large;}\n.byline {font-family: monospace;}'

    # keep_only_tags = [
    #     dict(name='h1'),
    #     dict(name='a'),
    #     dict(name='img'),
    #     dict(name='div'),
    #     dict(attrs={'div': 'content'})
    # ]
    # dict(name='p', attrs={'class': ['photoCaption', 'paragraph']})
    # remove_tags = [dict(name=['script', 'input'])]

    def parse_index(self):
        urlRoot = 'https://orientaldaily.on.cc'

        def scrape_feed(feed):
            # feed is an (href, section title) pair taken from the front-page menu
            f_url = '%s%s' % (urlRoot, feed[0])
            print('feed url %s' % f_url)
            soup = self.index_to_soup(f_url)
            # Verify the section is actually published on the day the recipe
            # runs; for instance, the finance and lifestyle sections are
            # unavailable on Sundays. Missing sections raise, and the caller
            # skips them.
            try:
                items = soup.findAll('div', 'sectionList')[0].findAll('li')
            except (IndexError, AttributeError):
                raise Exception('--- this section [%s] is not available today ---' % feed[1])
            return [{
                'url': '%s%s' % (urlRoot, li.a['href']),
                'title': li.findAll('div', attrs={'class': 'text'})[0].text,
                'date': strftime('%a, %d %b'),
                'description': li.findAll('div', attrs={'class': 'text'})[0].text,
                'content': '',
            } for li in items]

        soup = self.index_to_soup(urlRoot)
        # lookups = ['news', 'china_world', 'finance', 'lifestyle', 'sport']
        lookups = ['news', 'china_world', 'finance', 'entertainment', 'lifestyle', 'adult', 'sport']
        # no financial news on Sunday
        # if time.strftime('%w') == '0':
        #     lookups.remove('finance')
        menu = soup.findAll('ul', 'menuList clear')[0].findAll('li', attrs={'section': lookups})
        feeds = [(li.a['href'], li.text) for li in menu]
        print('The feeds are: %s' % feeds)
        ans = []
        for feed in feeds:
            try:
                print('section: %s | %s' % (feed[1], feed[0]))
                ans.append((feed[1], scrape_feed(feed)))
            except Exception as err:
                print('while processing feed %s: %s' % (feed[1], err))
                continue
        return ans

    def preprocess_html(self, soup):
        try:
            # Keep only the headline and the article body divs.
            html = str(soup.find('h1')) + ''.join(str(t) for t in soup.findAll('div', 'content'))
            pic = soup.find('div', 'paragraph photoParagraph')
            if pic is not None:
                # Append the article photo, linked to its full-size version.
                # The original template string was lost; an anchor-wrapped
                # <img> matching the two substituted values is assumed here.
                html += '<a href="%s"><img src="%s"/></a>' % (pic.a['href'], pic.img['src'])
            return BeautifulSoup(html)
        except Exception as err:
            # Some article layouts lack these elements; fall back to the raw page.
            print('preprocess_html fallback: %s' % err)
        return soup

    def get_browser(self, *args, **kwargs):
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        br.set_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
        br.set_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
        br.set_header('Accept-Language', 'en-US,en;q=0.5')
        return br
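
# A quick way to exercise this recipe locally (a sketch, assuming the calibre
# command-line tools are installed and this file is saved as
# oriental_daily.recipe; the output filename is arbitrary):
#
#   ebook-convert oriental_daily.recipe oriental_daily.epub --test -vv
#
# --test restricts the download to a couple of feeds and articles per feed,
# and -vv makes calibre echo the print() diagnostics above.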