@@ -49,47 +49,23 @@ class OrientalDailyPure(BasicNewsRecipe):
             return str(tag.contents[0]).replace('<em>', '').replace('</em>', '')
 
-
-        def old_scrap_feed(feed):
-            f_url = '%s%s' % (urlRoot, feed[0])
-            print (f_url)
-            #f_html = urlopen(f_url).read()
-            sf = self.index_to_soup(f_url)
-            # extract all h2 headlines
-            l_h2 = map(lambda x: extract_text(x), sf.findAll('h2'))
-            l_h2 = list(l_h2)[:len(list(l_h2))-2]
-
-            # for each headline, look for the feed title and feed url
-
-#            print '--------------------'
-            print ('LEVEL H2: %s' % l_h2)
-#            for hl in l_h2:
-#                print 'h1: ' + hl
-#                print sf.findAll('ul', {'title': hl })[0].findAll('li')
-            l_feed = map(lambda x: sf.findAll('ul', {'title': x })[0].findAll('li'), l_h2)
-            print ('-----------l_feed')
-            l_feed = [item for sublist in l_feed for item in sublist]
-            print (l_feed[0])
-            print ('------END -----l_feed')
-#            l_feed = map(lambda x: {'url': '%s%s' % (urlRoot, x[0].a['href']), 'title': extract_text(x[0].a),
-#                'date': strftime('%a, %d %b'),
-#                'description': '%s' % (extract_text(x[0].a)),
-#                'content': ''}, l_feed)
-            l_feed = map(lambda x: {'url': '%s%s' % (urlRoot, x.a['href']), 'title': extract_text(x.a),
-                'date': strftime('%a, %d %b'),
-                'description': '%s' % (extract_text(x.a)),
-                'content': ''}, l_feed)
-            print ('****************************')
-
-            l_feed = list(l_feed)
-            print (list(l_feed))
-            return l_feed
 
 
         def scrap_feed(feed):
             f_url = '%s%s' % (urlRoot, feed[0])
-            print (f_url)
+            print ('feed url %s ' % f_url)
             soup = self.index_to_soup(f_url)
-            articles = soup.findAll('div', 'sectionList')[0].findAll('li')
+            # verify a section is available for download on the day this script is run.
+            # skip a section if unavailable
+            # for instance, finance section is unavailable on Sunday, so is "lifestyle"
+            try:
+                articles = soup.findAll('div', 'sectionList')[0].findAll('li')
+            except:
+                print ('--- this section [%s] is not available today ---' % feed[1])
+                raise Exception ('--- this section [%s] is not available today ---' % feed[1])
+
+
+
+
             articles = map(lambda x:{'url': '%s%s' % (urlRoot, x.a['href']),
                 'title': x.findAll('div', attrs={'class' : 'text'})[0].text,
                 'date': strftime('%a, %d %b'),
@@ -102,15 +78,13 @@ class OrientalDailyPure(BasicNewsRecipe):
 
 
         urlRoot = 'https://orientaldaily.on.cc'
-        #url = '%s/cnt/news/%s/index.html' % (urlRoot, time.strftime('%Y%m%d'))
         url = urlRoot
-        #url = '%s/cnt/news/%s/index.html' % (urlRoot, '20201127')
         soup = self.index_to_soup(url)
-        #lookups = ['要聞港聞','兩岸國際','財經','娛樂','副刊','男極圈','體育','馬經','波經','社論專欄','慈善基金','昔日東方']
-        lookups = ['news', 'china_world', 'finance', 'lifestyle', 'sport']
+        #lookups = ['news', 'china_world', 'finance', 'lifestyle', 'sport']
+        lookups = ['news', 'china_world', 'finance', 'entertainment', 'lifestyle', 'adult', 'sport']
         # no finanical news on Sunday
-        if time.strftime('%w') == '0':
-            lookups.remove('finance')
+        #if time.strftime('%w') == '0':
+        #    lookups.remove('finance')
 
         feeds = soup.findAll('ul', 'menuList clear')[0].findAll('li', attrs={'section':lookups})
         feeds = map(lambda x: (x.a['href'], x.text), feeds)
@@ -119,8 +93,13 @@ class OrientalDailyPure(BasicNewsRecipe):
         print ('----------------------- The feeds are: %s' % feeds)
         ans = []
         for e in feeds:
-            print ('e[1] is: %s | %s\n' % (e[1], e[0]))
-            ans.append((e[1], scrap_feed(e)))
+            try:
+                print ('e[1] is: %s | %s\n' % (e[1], e[0]))
+                ans.append((e[1], scrap_feed(e)))
+            except Exception as e:
+                print('while processing feed: %s' % e)
+                continue
+
         print ('############')
         print (ans)
         return ans
@@ -141,7 +120,7 @@ class OrientalDailyPure(BasicNewsRecipe):
             #print (pic)
             if pic != None:
                 html += '<a href="%s"><img src="%s"></img></a>' % (str(pic.a['href']), str(pic.img['src']))
-            print('>>>>>>>>>>>>>>> %s' % html)
+            #print('>>>>>>>>>>>>>>> %s' % html)
             return BeautifulSoup(html)
         except Exception as e:
             print (e)