Introduction to web scraping and real world task

Recently, I was assigned a task: check whether the news pages in a given list are too old.
To do that, I need to go through the source of each page and extract its published date.

  • requests
    Requests lets us send an HTTP request and get the response from the web.
    You can refer to the Requests documentation here.
The task breaks down into four steps:
1. Open a page where all the sites are listed.
2. Get all the links to the news sites.
3. Get the published date from each news page.
4. Assert that the published date of each news page is less than one month before the current date.
def scroll_to_with_scroll_height(self, driver):
    """Scroll the current page down to the bottom of the document.

    Runs a JavaScript ``window.scrollTo`` call through ``driver`` with the
    vertical offset set to ``document.body.scrollHeight``.

    Args:
        driver: Object exposing ``execute_script(script)``; presumably a
            Selenium WebDriver -- confirm against the caller.
    """
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
# CSS selector for Zen news links that are not ads: anchors with an href that
# are direct children of a div carrying a class attribute, excluding anchors
# with the .qc-link, .context-content or .zen-ads__context class and anchors
# whose href contains "utm".
# NOTE(review): the original statement ended with a dangling line
# continuation; if further selector fragments followed it, they are not
# visible here -- confirm against the real locator file.
ZEN_NEWS_NOT_CONTAINS_ADS_ITEM_CSS_SELECTOR = (
    'div[class] > a:not(.qc-link)[href]:not(.context-content)'
    ':not(.zen-ads__context):not([href*="utm"])'
)
def find_all_current_zen_except_ads_elements(self, driver):
    """Return every element currently matching the non-ad Zen news selector.

    Args:
        driver: Object exposing ``find_elements(by, value)``; presumably a
            Selenium WebDriver -- confirm against the caller.

    Returns:
        Whatever ``driver.find_elements`` returns for
        ``NewTabZenLocators.ZEN_NEWS_NOT_CONTAINS_ADS_ITEM_CSS_SELECTOR``.
    """
    # 'css selector' is the locator strategy string behind Selenium's
    # By.CSS_SELECTOR. The generic find_elements(by, value) form is used
    # because find_elements_by_css_selector was removed in Selenium 4.
    return driver.find_elements(
        'css selector',
        NewTabZenLocators.ZEN_NEWS_NOT_CONTAINS_ADS_ITEM_CSS_SELECTOR,
    )
def get_attribute_all_zen_except_ads_elements(self, driver, attribute_name):
    """Collect one attribute from every non-ad Zen news element.

    Args:
        driver: Passed through unchanged to
            ``self.new_tab_zen_elem.find_all_current_zen_except_ads_elements``.
        attribute_name: Name of the attribute to read from each element,
            e.g. ``'href'``.

    Returns:
        A list with the ``get_attribute(attribute_name)`` result of each
        element, in the order the elements were returned; empty when no
        element was found.
    """
    # NOTE(review): the loop body was missing in the original snippet, so the
    # function always returned an empty list. Reading the requested attribute
    # from each element is the reconstruction -- confirm against the author's
    # intent.
    elements = self.new_tab_zen_elem.find_all_current_zen_except_ads_elements(driver)
    return [element.get_attribute(attribute_name) for element in elements]
# Fetch every collected news URL and check that it answers with HTTP 200.
for url in url_list:
    response = None
    try:
        # A timeout keeps one unresponsive site from hanging the whole run.
        response = requests.get(url, timeout=30)
    except (ConnectionError, requests.exceptions.RequestException):
        # Best effort: leave response as None so the first check below
        # reports the failure for this URL. requests raises its own
        # exception hierarchy, so it is caught here alongside whichever
        # ConnectionError name the file has in scope.
        pass
    expect(response is not None, f'Assert response not None for site {url}')
    if response is not None:
        # Guarded so a failed request cannot raise AttributeError on None.
        expect(response.status_code == 200, f'Assert response status code for site {url}')
def get_published_time_of_web_page(self, response_text):
    """Extract the published-time string from a page's HTML.

    Two sources are tried in order:

    1. A ``<meta property="article:published_time" content="...">`` tag
       inside ``<head>``.
    2. The top-level ``datePublished`` key of a
       ``<script type="application/ld+json">`` block. This is consulted only
       when the meta tag gave no value.

    When several candidates match within one source, the last one wins.

    Args:
        response_text: The HTML document as text.

    Returns:
        The raw published-time value as found in the page (not parsed), or
        ``None`` when neither source provides one.
    """
    import json

    published_time = None

    # Only <head> is parsed for the first pass.
    head_soup = BeautifulSoup(response_text, features='html.parser', parse_only=SoupStrainer('head'))
    for meta_tag in head_soup.find_all(name='meta'):
        if meta_tag.get('property') == 'article:published_time':
            published_time = meta_tag.get('content')
    if published_time is not None:
        return published_time

    # Fallback: look through JSON-LD script blocks only.
    json_ld_only = SoupStrainer('script', {'type': 'application/ld+json'})
    script_soup = BeautifulSoup(response_text, features='html.parser', parse_only=json_ld_only)
    for script_tag in script_soup.find_all('script'):
        raw_json = script_tag.text.strip()
        if 'datePublished' not in raw_json:
            continue
        try:
            # strict=False lets json accept control characters inside strings.
            published_time = json.loads(raw_json, strict=False)['datePublished']
        except (json.JSONDecodeError, KeyError, TypeError):
            # Best effort: skip blocks that are malformed or that do not carry
            # datePublished as a top-level key (e.g. a top-level JSON array).
            continue
    return published_time
import dateutil.parser
import datetime

def parse_string_to_date(string_datetime):
    """Parse a date/time string into a ``datetime`` object.

    Args:
        string_datetime: Date/time text in any format accepted by
            ``dateutil.parser.parse``.

    Returns:
        The parsed ``datetime.datetime``.
    """
    return dateutil.parser.parse(string_datetime)


def how_many_days_til_now(string_datetime):
    """Return the number of whole days elapsed since ``string_datetime``.

    Args:
        string_datetime: Date/time text in any format accepted by
            ``dateutil.parser.parse``.

    Returns:
        The ``days`` component of ``now - parsed date``; negative when the
        date lies in the future.
    """
    published = parse_string_to_date(string_datetime)
    # Take "now" in the parsed value's own timezone (naive when the parsed
    # value is naive): subtracting a naive and an aware datetime raises
    # TypeError.
    elapsed = datetime.datetime.now(tz=published.tzinfo) - published
    return elapsed.days
# Final checks for one URL: the request succeeded, returned HTTP 200, and the
# page was published at most 30 days ago.
expect(response is not None, f'Assert response not None for site {url}')
if response is not None:
    # Guarded so a failed request cannot raise AttributeError on None.
    expect(response.status_code == 200, f'Assert response status code for site {url}')
if published_time is not None:
    # Only checked when a published date was found; a None value cannot be
    # parsed into a date.
    expect(how_many_days_til_now(published_time) <= 30, f'Verify date of page {url}')
# Pages without a detectable published date are skipped silently. The author
# left this branch disabled; enable it to list those pages:
# else:
#     print(f'Url of the site which cannot get published date is : {url}')

A passionate automation engineer who strongly believes in “A man can do anything he wants if he puts in the work”.

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store