Initial commit
This commit is contained in:
168130
src/Tools/scraper/authors.json
Normal file
168130
src/Tools/scraper/authors.json
Normal file
File diff suppressed because it is too large
Load Diff
3896538
src/Tools/scraper/quotes.json
Normal file
3896538
src/Tools/scraper/quotes.json
Normal file
File diff suppressed because it is too large
Load Diff
217
src/Tools/scraper/scraper.py
Normal file
217
src/Tools/scraper/scraper.py
Normal file
@@ -0,0 +1,217 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
import time
|
||||
import re
|
||||
import os
|
||||
import signal
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
# Global abort flag, set True by the SIGINT handler so worker loops can stop early.
abort_flag = False

# Global session for reuse: one connection pool shared by all worker threads,
# with a browser-like User-Agent so the site serves normal pages.
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'})
|
||||
|
||||
def signal_handler(sig, frame):
    """SIGINT handler: announce the shutdown, then raise the global abort flag.

    Worker loops poll ``abort_flag`` and exit between units of work, so the
    in-progress JSON writes finish cleanly instead of being killed mid-file.
    """
    global abort_flag
    print("\nAborting... Waiting for current writes to complete...")
    abort_flag = True
|
||||
|
||||
# Register signal handler for SIGINT (Ctrl+C) so an interrupt requests a
# graceful shutdown (via abort_flag) instead of killing the process mid-write.
signal.signal(signal.SIGINT, signal_handler)
|
||||
|
||||
def get_authors(letter='a', page=1):
    """Fetch one page of the azquotes author index for *letter*.

    Args:
        letter: index letter ('a'–'z') to list authors for.
        page: 1-based page number within that letter.

    Returns:
        A list of ``{'person': name, 'href': link}`` dicts, or an empty
        list on HTTP failure, parse failure, or an empty page.
    """
    url = f"https://www.azquotes.com/quotes/authors/{letter}/{page}"
    try:
        # Explicit timeout: requests has no default, so a stalled server
        # would otherwise hang this worker thread indefinitely.
        response = session.get(url, timeout=30)
        if response.status_code != 200:
            return []

        soup = BeautifulSoup(response.content, 'html.parser')
        author_elements = soup.select('table tbody tr td:first-child a')

        return [
            {
                'person': element.get_text(strip=True),
                'href': element.get('href'),
            }
            for element in author_elements
        ]
    except Exception as e:
        print(f"Error fetching authors for letter {letter}, page {page}: {e}")
        return []
|
||||
|
||||
def has_next_page_author(soup):
    """Return True when the author-index page's pager contains a next link."""
    return soup.select_one('div.pager:nth-child(6) > li:last-child > a') is not None
|
||||
|
||||
def get_quotes_from_author(author_url):
    """Collect every quote from an author's page, following pagination.

    Args:
        author_url: absolute URL of the author's quote listing.

    Returns:
        A list of quote strings. On any request/parse error the partial
        result is discarded and an empty list is returned, so a half-scraped
        author is never recorded as complete. Stops early if ``abort_flag``
        is raised by the SIGINT handler.
    """
    global abort_flag
    all_quotes = []
    page = 1

    while True:
        if abort_flag:
            break
        # Page 1 is the bare author URL; later pages use the ?p= query.
        url = author_url if page == 1 else f"{author_url}?p={page}"

        try:
            # Explicit timeout: requests has no default, so a stalled server
            # would otherwise hang this worker thread indefinitely.
            response = session.get(url, timeout=30)
            if response.status_code != 200:
                break

            soup = BeautifulSoup(response.content, 'html.parser')
            quote_elements = soup.select('[id^="title_quote_link_"]')

            # Keep only non-empty quote texts from this page.
            quotes = [
                text
                for text in (el.get_text(strip=True) for el in quote_elements)
                if text
            ]

            if not quotes:  # No more quotes on this page
                break

            all_quotes.extend(quotes)

            # Stop when the pager offers no "next" link.
            if not soup.select_one('.next > a'):
                break

            page += 1

        except Exception as e:
            # Deliberate: drop partial pages so the author is retried whole
            # on the next run rather than saved half-scraped.
            all_quotes = []
            print(f"Error fetching quotes for {url}: {e}")
            break

    return all_quotes
|
||||
|
||||
def scrape_authors_for_letter(letter):
    """Collect every author listed under *letter*, walking pages until one is empty."""
    collected = []
    page = 1
    while True:
        batch = get_authors(letter, page)
        if not batch:
            return collected
        collected.extend(batch)
        page += 1
|
||||
|
||||
def scrape_quotes_for_author(author):
    """Fetch all quotes for one author record and tag each with the author name.

    *author* is a ``{'person': ..., 'href': ...}`` dict as produced by
    ``get_authors``. Returns a list of ``{'quote', 'author'}`` dicts, or
    an empty list when the href is missing or any error occurs.
    """
    try:
        href = author['href']
        if not href:
            return []

        # Relative hrefs are rooted at the site's domain.
        author_url = f"https://www.azquotes.com{href}" if href.startswith('/') else href

        return [
            {'quote': text, 'author': author['person']}
            for text in get_quotes_from_author(author_url)
        ]
    except Exception as e:
        print(f"Error scraping quotes for {author['person']}: {e}")
        return []
|
||||
|
||||
def main():
    """Scrape azquotes.com: first the author index, then each author's quotes.

    Authors are cached in authors.json and quotes in quotes.json. Existing
    quote data is loaded so already-scraped authors are skipped, and the
    quotes file is checkpointed periodically so a Ctrl+C loses little work.
    """
    print("Scraping authors...")
    letters = 'abcdefghijklmnopqrstuvwxyz'

    all_authors = []
    skip_authors = False
    if os.path.exists('authors.json'):
        choice = input("Authors file exists. Scrape authors again? (y/N): ")
        if choice.lower() != 'y':
            print("Skipping author scraping...")
            skip_authors = True
            with open('authors.json', 'r', encoding='utf-8') as f:
                all_authors = json.load(f)
            print(f"Loaded {len(all_authors)} authors from file.")

    if not skip_authors:
        # Scrape the 26 index letters concurrently.
        with ThreadPoolExecutor(max_workers=16) as executor:
            futures = [executor.submit(scrape_authors_for_letter, letter) for letter in letters]
            for future in as_completed(futures):
                try:
                    all_authors.extend(future.result())
                except Exception as e:
                    print(f"Error scraping authors: {e}")

        # Save authors to JSON
        with open('authors.json', 'w', encoding='utf-8') as f:
            json.dump(all_authors, f, indent=2)

    print(f"Total authors scraped: {len(all_authors)}")

    print("Scraping quotes...")

    all_quotes = []
    author_count = len(all_authors)

    # Resume support: drop authors whose quotes already exist in quotes.json.
    if os.path.exists('quotes.json'):
        with open('quotes.json', 'r', encoding='utf-8') as f:
            try:
                existing_quotes = json.load(f)
                all_quotes = existing_quotes
                print(f"Loaded {len(existing_quotes)} quotes")
            except Exception as e:
                print(f"Unable to load existing quotes due to exception {e}")
                exit()
        existing_authors = set(quote['author'] for quote in existing_quotes)
        all_authors = [author for author in all_authors if author['person'] not in existing_authors]

    current_count = author_count - len(all_authors)
    save_interval = 60 * 30  # checkpoint quotes.json every 30 minutes
    last_saved = time.time()
    with ThreadPoolExecutor(max_workers=8) as executor:
        # Map each future back to its author so progress can be reported even
        # when an author yields zero quotes — the previous quotes[0]['author']
        # lookup raised IndexError on empty results, which the broad except
        # below swallowed, silently skipping the progress line.
        future_to_author = {
            executor.submit(scrape_quotes_for_author, author): author
            for author in all_authors
        }
        for future in as_completed(future_to_author):
            try:
                if abort_flag:
                    break
                quotes = future.result()
                all_quotes.extend(quotes)
                if abort_flag:
                    break
                if (time.time() - last_saved > save_interval):  # save every n seconds
                    print("Saving quotes file...")
                    # Save quotes to JSON
                    with open('quotes.json', 'w', encoding='utf-8') as f:
                        json.dump(all_quotes, f, indent=2)
                    print("Saved quotes file...")
                    last_saved = time.time()
                current_count += 1
                author_name = future_to_author[future]['person']
                print(f"Quotes extracted for author {author_name}. Total progress: {current_count / author_count * 100:.2f}% ({current_count}/{author_count})")
            except Exception as e:
                print(f"Error scraping quotes: {e}")

    with open('quotes.json', 'w', encoding='utf-8') as f:
        json.dump(all_quotes, f, indent=2)
    print(f"Total quotes scraped: {len(all_quotes)}")
|
||||
|
||||
# Entry point: run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user