oriental_daily_pure.recipe

# -*- coding: utf-8 -*-
import re
import time
# import HTMLParser  # Python 2 import, replaced by html.parser below
from html.parser import HTMLParser
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class OrientalDailyPure(BasicNewsRecipe):
    title = 'Oriental Daily - ' + time.strftime('%d %b %Y')
    __author__ = 'Larry Chan'
    description = 'Oriental Daily, Hong Kong'
    publication_type = 'newspaper'
    language = 'zh'
    timefmt = ' [%a, %d %b, %Y]'
    masthead_url = 'http://orientaldaily.on.cc/img/v2/logo_odn.png'
    # cover_url = 'http://orientaldaily.on.cc/cnt/news/' + time.strftime('%Y%m%d') + '/photo/' + time.strftime('%m%d') + '-00174-001k1.jpg'
    cover_url = 'https://orientaldaily.on.cc/asset/main/%s/photo/337_sectMain.jpg' % time.strftime('%Y%m%d')
    # print('cover %s' % cover_url)
    delay = 0
    no_stylesheets = True
    extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
    # keep_only_tags = [
    #     dict(name='h1'),
    #     dict(name='a'),
    #     dict(name='img'),
    #     dict(name='div'),
    #     dict(attrs={'div': 'content'})
    # ]
    # dict(name='p', attrs={'class': ['photoCaption', 'paragraph']})
    # remove_tags = [dict(name=['script', 'input'])]
    # Relax HTMLParser's attribute-matching regex so that unusual attribute
    # values in the site's markup do not abort parsing.
    HTMLParser.attrfind = re.compile(
        r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
        r'(\'[^\']*\'|"[^"]*"|[^\s>^\[\]{}\|\'\"]*))?')
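
    # Note on the calibre contract assumed below: parse_index() must return a
    # list of (section title, list of article descriptions) tuples, where each
    # article description is a dict with at least 'title' and 'url' keys.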
    def parse_index(self):

        def extract_text(tag):
            # Strip <em> markup from a tag's first child (currently unused).
            return str(tag.contents[0]).replace('<em>', '').replace('</em>', '')

        def scrap_feed(feed):
            f_url = '%s%s' % (urlRoot, feed[0])
            print('feed url %s' % f_url)
            soup = self.index_to_soup(f_url)
            # Verify that the section is available for download on the day this
            # script is run and skip it if not; for instance, the finance and
            # lifestyle sections are unavailable on Sunday.
            try:
                articles = soup.findAll('div', 'sectionList')[0].findAll('li')
            except Exception:
                print('--- this section [%s] is not available today ---' % feed[1])
                raise Exception('--- this section [%s] is not available today ---' % feed[1])
            return [{
                'url': '%s%s' % (urlRoot, x.a['href']),
                'title': x.findAll('div', attrs={'class': 'text'})[0].text,
                'date': strftime('%a, %d %b'),
                'description': x.findAll('div', attrs={'class': 'text'})[0].text,
                'content': ''
            } for x in articles]
        urlRoot = 'https://orientaldaily.on.cc'
        print('############ before soup')
        soup = self.index_to_soup(urlRoot)
        print('############ after soup')
        # lookups = ['news', 'china_world', 'finance', 'lifestyle', 'sport']
        lookups = ['news', 'china_world', 'finance', 'entertainment', 'lifestyle', 'adult', 'sport']
        # There is no financial news on Sunday:
        # if time.strftime('%w') == '0':
        #     lookups.remove('finance')
        feeds = soup.findAll('ul', 'menuList clear')[0].findAll('li', attrs={'section': lookups})
        feeds = [(x.a['href'], x.text) for x in feeds]
        print('----------------------- The feeds are: %s' % feeds)
        ans = []
        for feed in feeds:
            try:
                print('feed is: %s | %s\n' % (feed[1], feed[0]))
                ans.append((feed[1], scrap_feed(feed)))
            except Exception as err:
                print('while processing feed: %s' % err)
                continue
        print('############')
        print(ans)
        return ans
    def preprocess_html(self, soup):
        print('((((( begin article ))))')
        try:
            # Keep only the headline and the article body blocks.
            html = str(soup.find('h1')) + ''.join(str(t) for t in soup.findAll('div', 'content'))
            # Append the article photo, if one is present.
            pic = soup.find('div', 'paragraph photoParagraph')
            if pic is not None:
                html += '<a href="%s"><img src="%s"/></a>' % (str(pic.a['href']), str(pic.img['src']))
            return BeautifulSoup(html)
        except Exception as e:
            print(e)
            print('other article...')
        print('((((( end article ))))')
        return soup
    def get_browser(self, *args, **kwargs):
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        # br.set_header('User-Agent', value='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36')
        br.set_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
        br.set_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
        br.set_header('Accept-Language', 'en-US,en;q=0.5')
        return br
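
# A typical way to try the recipe locally is calibre's ebook-convert tool
# (the output filename here is only an example):
#
#   ebook-convert oriental_daily_pure.recipe output.epub --test -vv
#
# --test limits the download to a couple of articles per feed so that
# parse_index() and preprocess_html() can be checked quickly.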