from PIL import Image
from IPython.display import display

image_path = "hacker-element.png"
image = Image.open(image_path)
display(image)
Web Scraping Using Python
This is a web scraping task to find conflict/war related news articles on the internet. There has been quite a lot of that, considering the Russia/Ukraine conflict, the South Sudan conflict, and the Palestine/Israel conflict, to mention only the most reported cases.
Using BeautifulSoup + requests
Understanding the website's structure
Prior to scraping, inspect the HTML source code of the web page to identify the elements you want to scrape.
Set up your development environment
Create a virtual environment, following the prompts in your IDE. For VS Code I pressed Ctrl+Shift+P and searched for Python: Create Environment. A beginner web scraper in Python is advised to start with the requests and beautifulsoup4 libraries, which is what we will use.
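Outside an IDE, the same setup can be done from a terminal; a minimal sketch for Linux/macOS:
# create and activate a virtual environment, then install the two libraries
python -m venv .venv
source .venv/bin/activate
pip install requests beautifulsoup4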
import requests
from bs4 import BeautifulSoup
baseurl = "https://news.ycombinator.com"
user = ""
passd = ""

s = requests.Session()
data = {"goto": "news", "acct": user, "pw": passd}
# the login form posts to /login
r = s.post(f'{baseurl}/login', data=data)

soup = BeautifulSoup(r.text, 'html.parser')
if soup.find(id='logout') is not None:
    print("Successfully logged in")
else:
    print("Authentication error")
Inspect HTML element
Each post is wrapped in a <tr> tag with the class athing.
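In simplified form, the markup for one post looks roughly like this (a sketch for orientation, not the exact Hacker News HTML):
<tr class="athing" id="44754697">
  <td class="title"><span class="rank">1.</span></td>
  <td class="votelinks">...</td>
  <td class="title"><a href="https://example.com/story">Story title</a></td>
</tr>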
Scrape with requests + beautifulsoup4
import requests
from bs4 import BeautifulSoup

r = requests.get("https://news.ycombinator.com/")
soup = BeautifulSoup(r.text, 'html.parser')
links = soup.find_all('tr', class_='athing')

formatted_links = []
for link in links:
    data = {
        'id': link['id'],
        'title': link.find_all("td")[2].a.text,
        'url': link.find_all("td")[2].a['href'],
        'rank': int(link.find_all("td")[0].span.text.replace('.', ''))
    }
    formatted_links.append(data)

print(formatted_links)
[{'id': '44754697', 'title': 'New quantum state of matter found at interface of exotic materials', 'url': 'https://phys.org/news/2025-07-quantum-state-interface-exotic-materials.html', 'rank': 1}, {'id': '44778936', 'title': 'Modern Node.js Patterns', 'url': 'https://kashw1n.com/blog/nodejs-2025/', 'rank': 2}, {'id': '44780353', 'title': 'So you want to parse a PDF?', 'url': 'https://eliot-jones.com/2025/8/pdf-parsing-xref', 'rank': 3}, {'id': '44779428', 'title': 'Writing a good design document', 'url': 'https://grantslatton.com/how-to-design-document', 'rank': 4}, {'id': '44777760', 'title': 'Persona vectors: Monitoring and controlling character traits in language models', 'url': 'https://www.anthropic.com/research/persona-vectors', 'rank': 5}, {'id': '44781523', 'title': 'A parser for TypeScript types, written in TypeScript types', 'url': 'https://github.com/easrng/tsints', 'rank': 6}, {'id': '44775563', 'title': "If you're remote, ramble", 'url': 'https://stephango.com/ramblings', 'rank': 7}, {'id': '44765562', 'title': 'Life, Work, Death and the Peasant: Family Formation', 'url': 'https://acoup.blog/2025/08/01/collections-life-work-death-and-the-peasant-part-iiia-family-formation/', 'rank': 8}, {'id': '44777766', 'title': 'How Python grew from a language to a community', 'url': 'https://thenewstack.io/how-python-grew-from-a-language-to-a-community/', 'rank': 9}, {'id': '44781116', 'title': 'Why doctors hate their computers (2018)', 'url': 'https://www.newyorker.com/magazine/2018/11/12/why-doctors-hate-their-computers', 'rank': 10}, {'id': '44780878', 'title': 'Typed languages are better suited for vibecoding', 'url': 'https://solmaz.io/typed-languages-are-better-suited-for-vibecoding', 'rank': 11}, {'id': '44782046', 'title': 'Rising Young Worker Despair in the United States', 'url': 'https://www.nber.org/papers/w34071', 'rank': 12}, {'id': '44743631', 'title': 'C++: "model of the hardware" vs. "model of the compiler" (2018)', 'url': 'http://ithare.com/c-model-of-the-hardware-vs-model-of-the-compiler/', 'rank': 13}, {'id': '44780540', 'title': 'How to grow almost anything', 'url': 'https://howtogrowalmostanything.notion.site/htgaa25', 'rank': 14}, {'id': '44767508', 'title': 'Efficiently Generating a Number in a Range (2018)', 'url': 'https://www.pcg-random.org/posts/bounded-rands.html', 'rank': 15}, {'id': '44745441', 'title': "2,500-year-old Siberian 'ice mummy' had intricate tattoos, imaging reveals", 'url': 'https://www.bbc.com/news/articles/c4gzx0zm68vo', 'rank': 16}, {'id': '44765730', 'title': 'Welcome to url.town, population 465', 'url': 'https://url.town/', 'rank': 17}, {'id': '44775700', 'title': 'Tokens are getting more expensive', 'url': 'https://ethanding.substack.com/p/ai-subscriptions-get-short-squeezed', 'rank': 18}, {'id': '44760583', 'title': 'Survival at High Altitudes: Wheel-Well Passengers (1996)', 'url': 'https://rosap.ntl.bts.gov/view/dot/57536', 'rank': 19}, {'id': '44781189', 'title': 'Poorest US workers hit hardest by slowing wage growth', 'url': 'https://www.ft.com/content/cfb77a53-fef8-4382-b102-c217e0aa4b25', 'rank': 20}, {'id': '44774104', 'title': 'Twenty Eighth International Obfuscated C Code Contest', 'url': 'https://www.ioccc.org/2024/index.html', 'rank': 21}, {'id': '44764696', 'title': 'A dedicated skin-to-brain circuit for cool sensation in mice', 'url': 'https://www.sciencedaily.com/releases/2025/07/250730030354.htm', 'rank': 22}, {'id': '44777055', 'title': 'This Old SGI: notes and memoirs on the Silicon Graphics 4D series (1996)', 'url': 'https://archive.irixnet.org/thisoldsgi/', 'rank': 23}, {'id': '44775830', 'title': 'How to make almost anything (2019)', 'url': 'https://fab.cba.mit.edu/classes/863.19/CBA/people/dsculley/index.html', 'rank': 24}, {'id': '44779839', 'title': 'Everything to know about UniFi OS Server', 'url': 'https://deluisio.com/networking/unifi/2025/08/03/everything-you-need-to-know-about-unifi-os-server-before-you-waste-time-testing-it/', 'rank': 25}, {'id': '44762397', 'title': 'Show HN: Schematra – Sinatra-inspired minimal web framework for Chicken Scheme', 'url': 'https://github.com/rolandoam/schematra', 'rank': 26}, {'id': '44754789', 'title': 'The first lunar road trip', 'url': 'https://nautil.us/the-first-lunar-road-trip-1227738/', 'rank': 27}, {'id': '44771808', 'title': 'Lina Khan points to Figma IPO as vindication of M&A scrutiny', 'url': 'https://techcrunch.com/2025/08/02/lina-khan-points-to-figma-ipo-as-vindication-for-ma-scrutiny/', 'rank': 28}, {'id': '44780552', 'title': 'Learnable Programming (2012)', 'url': 'https://worrydream.com/LearnableProgramming/', 'rank': 29}, {'id': '44779178', 'title': 'Shrinking freshwater availability increasing land contribution to sea level rise', 'url': 'https://news.asu.edu/20250725-environment-and-sustainability-new-global-study-shows-freshwater-disappearing-alarming', 'rank': 30}]
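Since the stated goal is conflict/war related news, the scraped titles could then be filtered. A minimal sketch; the keyword list here is an assumption, not from the original post:
# hypothetical keyword filter for conflict-related headlines
keywords = ['war', 'conflict', 'ukraine', 'gaza', 'israel', 'sudan']
conflict_news = [d for d in formatted_links
                 if any(k in d['title'].lower() for k in keywords)]
print(conflict_news)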
Store data as .csv
import csv

file = 'hacker_news_posts.csv'
with open(file, 'w', newline="") as f:
    writer = csv.DictWriter(f, fieldnames=['id', 'title', 'url', 'rank'])
    writer.writeheader()
    for row in formatted_links:
        writer.writerow(row)
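To sanity-check the file, it can be read back; a quick sketch using pandas (which is also used later in this post):
import pandas as pd

df_check = pd.read_csv('hacker_news_posts.csv')
print(df_check.head())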
Store data in PostgreSQL
Step 1: Installing PostgreSQL
Follow the PostgreSQL download page for downloads and installation
Step 2: Creating a Database Table
First, you'll need a table. Start the service, switch to the postgres superuser, open the psql shell, then create the database and connect to it:
# start the service
sudo systemctl start postgresql.service
# switch to the postgres superuser and open the psql shell
sudo -i -u postgres
psql
CREATE DATABASE scrape_demo;
\c scrape_demo
CREATE TABLE "hn_links" (
"id" INTEGER NOT NULL,
"title" VARCHAR NOT NULL,
"url" VARCHAR NOT NULL,
"rank" INTEGER NOT NULL
);
Step 3: Install Psycopg2 to Connect to PostgreSQL
pip install psycopg2
Establish connection to the database
Ensure you set a password for the postgres user, which by default logs in without one:
sudo -u postgres psql
postgres=# ALTER USER postgres PASSWORD 'myPassword';
ALTER ROLE
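The connection code below reads the password from a .env file via python-dotenv (pip install python-dotenv). A minimal example; the variable name pass is inferred from the os.getenv("pass") call below:
# .env
pass=myPassword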
import psycopg2
import os
import dotenv

dotenv.load_dotenv()
p = os.getenv("pass")
table_name = "hn_links"
csv_path = "hacker_news_posts.csv"

con = psycopg2.connect(host="127.0.0.1", port="5432", user="postgres", password=p, database="scrape_demo")

# Get a database cursor
cur = con.cursor()

r = requests.get('https://news.ycombinator.com')
soup = BeautifulSoup(r.text, 'html.parser')
links = soup.find_all('tr', class_='athing')

for link in links:
    cur.execute("""
        INSERT INTO hn_links (id, title, url, rank)
        VALUES (%s, %s, %s, %s)
        """,
        (link['id'],
         link.find_all('td')[2].a.text,
         link.find_all('td')[2].a['href'],
         int(link.find_all('td')[0].span.text.replace('.', '')))
    )

# Commit the data
con.commit()

# Close our database connections
cur.close()
con.close()
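As a quick sanity check, the inserted rows can be counted back. A minimal sketch reusing the same connection parameters; this verification step is not part of the original scraper:
# hypothetical verification step
con = psycopg2.connect(host="127.0.0.1", port="5432", user="postgres", password=p, database="scrape_demo")
cur = con.cursor()
cur.execute("SELECT COUNT(*) FROM hn_links")
print(cur.fetchone()[0])
cur.close()
con.close()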
Using ScrapingBee Python Client
ScrapingBee is a subscription API that provides a way to bypass a website's anti-scraping measures.
from scrapingbee import ScrapingBeeClient
import json
import pandas as pd

dotenv.load_dotenv()
key = os.getenv("spring_bee_api_key")

client = ScrapingBeeClient(api_key=key)
def google_news_headlines_api(country_code='US'):
    extract_rules = {
        "news": {
            "selector": "article",
            "type": "list",
            "output": {
                "title": ".gPFEn,.JtKRv",
                "source": ".vr1PYe",
                "time": "time@datetime",
                "author": ".bInasb",
                "link": ".WwrzSb@href"
            }
        }
    }
    js_scenario = {
        "instructions": [
            {"evaluate": "document.querySelectorAll('.WwrzSb').forEach( (e) => e.href = e.href );"}
        ]
    }
    response = client.get(
        f'https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZxYUdjU0FtVnVHZ0pWVXlnQVAB?&gl={country_code}',
        params={
            "custom_google": "true",
            "wait_for": ".bInasb",
            "extract_rules": extract_rules,
            "js_scenario": js_scenario,
        },
        retries=2
    )
    if response.text.startswith('{"message":"Invalid api key:'):
        return "Oops! It seems you may have missed adding your API KEY or you are using an incorrect key.\nGet your free API KEY and 1000 free scraping credits by signing up to our platform here: https://app.scrapingbee.com/account/register"
    else:
        def get_info():
            if len(response.json()['news']) == 0:
                return "FAILED TO RETRIEVE NEWS"
            else:
                return "SUCCESS"
        return pd.DataFrame({
            'count': len(response.json()['news']),
            'news_extracts': response.json()['news'],
            'info': f"{response.status_code} {get_info()}",
        })

# country_code: set the news location; US, IN, etc.
df = google_news_headlines_api(country_code='US')
print(df.iloc[:10])
count news_extracts info
0 263 {'title': 'Texas Democrats Leave State to Stop... 200 SUCCESS
1 263 {'title': 'Democrats flee Texas to block Repub... 200 SUCCESS
2 263 {'title': 'A Texas Democratic lawmaker on thei... 200 SUCCESS
3 263 {'title': 'Texas Democrats flee state to preve... 200 SUCCESS
4 263 {'title': 'Videos of emaciated hostages condem... 200 SUCCESS
5 263 {'title': 'Hamas says it will allow aid for ho... 200 SUCCESS
6 263 {'title': 'Netanyahu asks Red Cross to help ho... 200 SUCCESS
7 263 {'title': 'Hamas says open to ICRC delivering ... 200 SUCCESS
8 263 {'title': 'White House advisers defend Trump’s... 200 SUCCESS
9 263 {'title': 'Trump Fired America’s Economic Data... 200 SUCCESS
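Like the Hacker News data, these extracts can be persisted for later filtering; a one-line sketch with pandas (the filename is hypothetical):
# save the headline extracts to CSV
df.to_csv('google_news_headlines.csv', index=False)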
Web scraping with Scrapy
Scrapy is a web scraping framework built on an event-driven networking infrastructure around an asynchronous engine, which allows for more efficiency and scalability. It consists of a crawler that handles the low-level logic, and a spider, provided by the user, that tells the crawler which requests to generate and how to parse and retrieve data.
In this section we use scrapy to scrape the product listings available at web-scraping.dev, but first some housekeeping.
To install scrapy, run pip install scrapy, or better still add scrapy to your project's requirements.txt and run pip install -r requirements.txt.
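For reference, a requirements.txt covering the libraries used in this post might look like the sketch below (unpinned; pin versions as your project requires):
# requirements.txt
requests
beautifulsoup4
psycopg2
python-dotenv
pandas
scrapingbee
scrapy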
Start a scrapy project by running scrapy startproject <project-name> <project-directory> in a terminal. Some scrapy commands are listed below:
!scrapy --help
Scrapy 2.13.3 - active project: webscrapingdev
Usage:
scrapy <command> [options] [args]
Available commands:
bench Run quick benchmark test
check Check spider contracts
crawl Run a spider
edit Edit spider
fetch Fetch a URL using the Scrapy downloader
genspider Generate new spider using pre-defined templates
list List available spiders
parse Parse URL (using its spider) and print the results
runspider Run a self-contained spider (without creating a project)
settings Get settings values
shell Interactive scraping console
startproject Create new project
version Print Scrapy version
view Open URL in browser, as seen by Scrapy
Use "scrapy <command> -h" to see more info about a command
Creating a spider
Run scrapy genspider <name> <host-to-scrape>:
!scrapy genspider products web-scraping.dev
Spider 'products' already exists in module:
webscrapingdev.spiders.products
!scrapy list
products
!tree
.
├── article-class.png
├── hacker-element.png
├── hacker_news_posts.csv
├── LICENSE
├── producthunt.json
├── README.md
├── requirements.txt
├── results.json
├── scrapy.cfg
├── web-scrap_files
│   ├── figure-html
│   │   └── cell-2-output-1.png
│   └── libs
│       ├── bootstrap
│       │   ├── bootstrap-b9f025fa521194ab51f5de92fbd134be.min.css
│       │   ├── bootstrap-icons.css
│       │   ├── bootstrap-icons.woff
│       │   └── bootstrap.min.js
│       ├── clipboard
│       │   └── clipboard.min.js
│       └── quarto-html
│           ├── anchor.min.js
│           ├── popper.min.js
│           ├── quarto.js
│           ├── quarto-syntax-highlighting-37eea08aefeeee20ff55810ff984fec1.css
│           ├── tabsets
│           │   └── tabsets.js
│           ├── tippy.css
│           └── tippy.umd.min.js
├── web-scrap.html
├── webscrapingdev
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── __pycache__
│   │   ├── __init__.cpython-312.pyc
│   │   └── settings.cpython-312.pyc
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       ├── products.py
│       └── __pycache__
│           ├── __init__.cpython-312.pyc
│           └── products.cpython-312.pyc
├── web-scrap.ipynb
├── web-scrap.py
├── web-scrap.qmd
└── web-scrap.quarto_ipynb

12 directories, 38 files
If you open the generated spider, products.py, you'll find the following:
import scrapy

class ProductsSpider(scrapy.Spider):
    name = "products"
    allowed_domains = ["web-scraping.dev"]
    start_urls = ["https://web-scraping.dev"]

    def parse(self, response):
        pass
- name is used as a reference to the spider for scrapy commands like crawl, which runs the scraper.
- allowed_domains is a safety feature restricting this spider to crawl only the listed domains.
- start_urls indicates the spider's starting point, while parse() is the first callback executed on the downloaded responses.
Adding crawling logic
We want our start_urls to be a listing page (for example, a topic directory like https://www.producthunt.com/topics/developer-tools, or here, https://web-scraping.dev/products) and our parse() callback method to find all product links and schedule them to be scraped:
# /spiders/products.py
import scrapy
from scrapy.http import Response, Request

class ProductsSpider(scrapy.Spider):
    name = 'products'
    allowed_domains = ['web-scraping.dev']
    start_urls = [
        'https://web-scraping.dev/products',
    ]

    def parse(self, response: Response):
        product_urls = response.xpath(
            "//div[@class='row product']/div/h3/a/@href"
        ).getall()
        for url in product_urls:
            yield Request(url, callback=self.parse_product)
        # or shortcut in scrapy >2.0
        # yield from response.follow_all(product_urls, callback=self.parse_product)

    def parse_product(self, response: Response):
        print(response)
Adding Parsing Logic
Populate parse_product():
# /spiders/products.py
...
    def parse_product(self, response: Response):
        yield {
            "title": response.xpath("//h3[contains(@class, 'product-title')]/text()").get(),
            "price": response.xpath("//span[contains(@class, 'product-price')]/text()").get(),
            "image": response.xpath("//div[contains(@class, 'product-image')]/img/@src").get(),
            "description": response.xpath("//p[contains(@class, 'description')]/text()").get()
        }
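XPath expressions like these can be tested interactively before committing them to the spider, using the shell command from the listing above (the product URL here is an example path on web-scraping.dev):
scrapy shell "https://web-scraping.dev/product/1"
>>> response.xpath("//h3[contains(@class, 'product-title')]/text()").get()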
Basic Settings
Adjust recommended settings:
# settings.py

# will ignore /robots.txt rules that might prevent scraping
ROBOTSTXT_OBEY = False

# will cache all requests to the /httpcache directory, which makes running spiders in development much quicker
# tip: to refresh the cache just delete the /httpcache directory
HTTPCACHE_ENABLED = True

# while developing we want to see debug logs
LOG_LEVEL = "DEBUG"  # or "INFO" in production

# to avoid basic bot detection we want to set some basic headers
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'en',
}
Running Spiders
Run spiders either through the scrapy command or by calling scrapy explicitly from a Python script.
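For the scripted route, here is a minimal sketch using Scrapy's CrawlerProcess, assuming it is run from the project directory so the project settings are picked up; the command route follows below:
# run_products.py (hypothetical runner script)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl("products")  # the spider's name attribute
process.start()            # blocks until the crawl finishes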
%%capture
!scrapy crawl products
Saving results
%%capture
!scrapy crawl products --output results.json
!tree
.
├── article-class.png
├── hacker-element.png
├── hacker_news_posts.csv
├── LICENSE
├── producthunt.json
├── README.md
├── requirements.txt
├── results.json
├── scrapy.cfg
├── web-scrap_files
│   ├── figure-html
│   │   └── cell-2-output-1.png
│   └── libs
│       ├── bootstrap
│       │   ├── bootstrap-b9f025fa521194ab51f5de92fbd134be.min.css
│       │   ├── bootstrap-icons.css
│       │   ├── bootstrap-icons.woff
│       │   └── bootstrap.min.js
│       ├── clipboard
│       │   └── clipboard.min.js
│       └── quarto-html
│           ├── anchor.min.js
│           ├── popper.min.js
│           ├── quarto.js
│           ├── quarto-syntax-highlighting-37eea08aefeeee20ff55810ff984fec1.css
│           ├── tabsets
│           │   └── tabsets.js
│           ├── tippy.css
│           └── tippy.umd.min.js
├── web-scrap.html
├── webscrapingdev
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── __pycache__
│   │   ├── __init__.cpython-312.pyc
│   │   └── settings.cpython-312.pyc
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       ├── products.py
│       └── __pycache__
│           ├── __init__.cpython-312.pyc
│           └── products.cpython-312.pyc
├── web-scrap.ipynb
├── web-scrap.py
├── web-scrap.qmd
└── web-scrap.quarto_ipynb

12 directories, 38 files
import json
json_file = 'results.json'
with open(json_file) as f:
    j_obj = json.load(f)

json_fmt = json.dumps(j_obj, indent=2)
print(json_fmt)
[
{
"title": "Blue Energy Potion",
"price": "$4.99",
"image": "https://web-scraping.dev/assets/products/blue-potion.webp",
"description": "Ignite your gaming sessions with our 'Blue Energy Potion', a premium energy drink crafted for dedicated gamers. Inspired by the classic video game potions, this energy drink provides a much-needed boost to keep you focused and energized. It's more than just an energy drink - it's an ode to the gaming culture, packaged in an aesthetically pleasing potion-like bottle that'll make you feel like you're in your favorite game world. Drink up and game on!"
},
{
"title": "Red Energy Potion",
"price": "$4.99",
"image": "https://web-scraping.dev/assets/products/red-potion.webp",
"description": "Elevate your game with our 'Red Potion', an extraordinary energy drink that's as enticing as it is effective. This fiery red potion delivers an explosive berry flavor and an energy kick that keeps you at the top of your game. Are you ready to level up?"
},
{
"title": "Teal Energy Potion",
"price": "$4.99",
"image": "https://web-scraping.dev/assets/products/teal-potion.webp",
"description": "Experience a surge of vitality with our 'Teal Potion', an exceptional energy drink designed for the gaming community. With its intriguing teal color and a flavor that keeps you asking for more, this potion is your best companion during those long gaming nights. Every sip is an adventure - let the quest begin!"
},
{
"title": "Dark Red Energy Potion",
"price": "$4.99",
"image": "https://web-scraping.dev/assets/products/darkred-potion.webp",
"description": "Unleash the power within with our 'Dark Red Potion', an energy drink as intense as the games you play. Its deep red color and bold cherry cola flavor are as inviting as they are invigorating. Bring out the best in your gaming performance, and unlock your full potential."
},
{
"title": "Box of Chocolate Candy",
"price": "$9.99 ",
"image": "https://web-scraping.dev/assets/products/orange-chocolate-box-small-1.webp",
"description": "Indulge your sweet tooth with our Box of Chocolate Candy. Each box contains an assortment of rich, flavorful chocolates with a smooth, creamy filling. Choose from a variety of flavors including zesty orange and sweet cherry. Whether you're looking for the perfect gift or just want to treat yourself, our Box of Chocolate Candy is sure to satisfy."
}
]