Initial commit

This commit is contained in:
2026-02-02 12:56:09 +01:00
commit d35bccbb9f
87 changed files with 4148118 additions and 0 deletions

168130
src/Tools/scraper/authors.json Normal file

File diff suppressed because it is too large Load Diff

3896538
src/Tools/scraper/quotes.json Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,217 @@
import requests
from bs4 import BeautifulSoup
import json
import time
import re
import os
import signal
from concurrent.futures import ThreadPoolExecutor, as_completed
# Global abort flag
# Set by the SIGINT handler below; worker loops poll this flag and stop
# gracefully so a checkpoint write is never interrupted mid-file.
abort_flag = False

# Global session for reuse
# One shared requests.Session gives connection pooling across all worker
# threads and carries the browser-like User-Agent on every request.
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'})

def signal_handler(sig, frame):
    """Handle Ctrl+C: request a graceful shutdown via the global abort flag.

    Args:
        sig: Signal number delivered by the OS (unused).
        frame: Current stack frame at delivery time (unused).
    """
    global abort_flag
    print("\nAborting... Waiting for current writes to complete...")
    abort_flag = True

# Register signal handler for SIGINT (Ctrl+C)
signal.signal(signal.SIGINT, signal_handler)
def get_authors(letter='a', page=1):
    """Fetch one page of the author index for a given letter.

    Args:
        letter: Index letter ('a'-'z') to browse.
        page: 1-based page number within that letter's index.

    Returns:
        A list of dicts with keys 'person' (display name) and 'href'
        (link to the author's quote page, usually site-relative).
        Returns an empty list on non-200 responses or any error, which
        the caller treats as "no more pages".
    """
    url = f"https://www.azquotes.com/quotes/authors/{letter}/{page}"
    try:
        # Timeout (connect, read) keeps a stalled server from hanging a
        # worker thread forever; without it session.get can block indefinitely.
        response = session.get(url, timeout=(10, 30))
        if response.status_code != 200:
            return []
        soup = BeautifulSoup(response.content, 'html.parser')
        author_elements = soup.select('table tbody tr td:first-child a')
        return [
            {'person': element.get_text(strip=True), 'href': element.get('href')}
            for element in author_elements
        ]
    except Exception as e:
        print(f"Error fetching authors for letter {letter}, page {page}: {e}")
        return []
def has_next_page_author(soup):
    """Report whether the author-index page exposes a 'next page' link.

    Args:
        soup: Parsed BeautifulSoup document of an author index page.

    Returns:
        True if the pager's trailing anchor exists, False otherwise.
    """
    return soup.select_one('div.pager:nth-child(6) > li:last-child > a') is not None
def get_quotes_from_author(author_url):
    """Collect every quote from an author's page, following pagination.

    Stops early when the global abort flag is set.  On any request or
    parse error the partial result is deliberately discarded: an author
    with no saved quotes is not filtered out by the resume logic in
    main(), so a transient failure causes a clean full retry on the next
    run instead of leaving a half-scraped author in quotes.json.

    Args:
        author_url: Absolute URL of the author's quote page.

    Returns:
        List of non-empty quote strings; empty list on abort or error.
    """
    global abort_flag
    all_quotes = []
    page = 1
    while True:
        if abort_flag:
            break
        # Page 1 is the bare author URL; subsequent pages use ?p=N.
        if page == 1:
            url = author_url
        else:
            url = f"{author_url}?p={page}"
        try:
            # Timeout (connect, read) prevents one dead connection from
            # stalling the thread pool indefinitely.
            response = session.get(url, timeout=(10, 30))
            if response.status_code != 200:
                break
            soup = BeautifulSoup(response.content, 'html.parser')
            quote_elements = soup.select('[id^="title_quote_link_"]')
            # Keep only non-empty quote texts.
            quotes = [text for text in
                      (element.get_text(strip=True) for element in quote_elements)
                      if text]
            if not quotes:  # No more quotes on this page
                break
            all_quotes.extend(quotes)
            # Stop when the pager exposes no "next" link.
            if not soup.select_one('.next > a'):
                break
            page += 1
        except Exception as e:
            # Drop partial quotes so the author is fully re-scraped later.
            all_quotes = []
            print(f"Error fetching quotes for {url}: {e}")
            break
    return all_quotes
def scrape_authors_for_letter(letter):
    """Walk every index page for one letter and gather all authors.

    Args:
        letter: Single index letter ('a'-'z').

    Returns:
        List of author dicts accumulated across all pages; iteration
        ends at the first page that yields no authors.
    """
    collected = []
    page = 1
    while True:
        batch = get_authors(letter, page)
        if not batch:
            return collected
        collected.extend(batch)
        page += 1
def scrape_quotes_for_author(author):
    """Scrape all quotes for one author record.

    Args:
        author: Dict with 'person' (display name) and 'href' (quote-page
            link, possibly site-relative).

    Returns:
        List of {'quote': str, 'author': str} dicts; empty list when the
        href is missing/empty or any error occurs.
    """
    try:
        href = author['href']
        if not href:
            return []
        # Index pages yield site-relative links; absolutize them here.
        author_url = f"https://www.azquotes.com{href}" if href.startswith('/') else href
        return [
            {'quote': text, 'author': author['person']}
            for text in get_quotes_from_author(author_url)
        ]
    except Exception as e:
        print(f"Error scraping quotes for {author['person']}: {e}")
        return []
def main():
    """Drive the scrape: authors first (cached to authors.json), then quotes.

    Quotes are check-pointed to quotes.json every `save_interval` seconds
    and again on exit.  Authors already present in quotes.json are skipped,
    so an aborted run (Ctrl+C sets the global abort flag) can be resumed.
    """
    print("Scraping authors...")
    letters = 'abcdefghijklmnopqrstuvwxyz'
    all_authors = []
    skip_authors = False
    if os.path.exists('authors.json'):
        choice = input("Authors file exists. Scrape authors again? (y/N): ")
        if choice.lower() != 'y':
            print("Skipping author scraping...")
            skip_authors = True
            with open('authors.json', 'r', encoding='utf-8') as f:
                all_authors = json.load(f)
            print(f"Loaded {len(all_authors)} authors from file.")
    if not skip_authors:
        # One task per letter; index pages are cheap, so 16 workers is fine.
        with ThreadPoolExecutor(max_workers=16) as executor:
            futures = [executor.submit(scrape_authors_for_letter, letter) for letter in letters]
            for future in as_completed(futures):
                try:
                    all_authors.extend(future.result())
                except Exception as e:
                    print(f"Error scraping authors: {e}")
        # Save authors to JSON so the next run can skip this phase.
        with open('authors.json', 'w', encoding='utf-8') as f:
            json.dump(all_authors, f, indent=2)
    print(f"Total authors scraped: {len(all_authors)}")
    print("Scraping quotes...")
    all_quotes = []
    author_count = len(all_authors)
    # Resume support: restrict all_authors to those not already extracted.
    if os.path.exists('quotes.json'):
        with open('quotes.json', 'r', encoding='utf-8') as f:
            try:
                existing_quotes = json.load(f)
                all_quotes = existing_quotes
                print(f"Loaded {len(existing_quotes)} quotes")
            except Exception as e:
                # A corrupt checkpoint would be overwritten below; abort
                # instead of silently losing previously scraped quotes.
                print(f"Unable to load existing quotes due to exception {e}")
                raise SystemExit(1)
        existing_authors = set(quote['author'] for quote in existing_quotes)
        all_authors = [author for author in all_authors if author['person'] not in existing_authors]
    current_count = author_count - len(all_authors)
    save_interval = 60 * 30  # checkpoint quotes.json every 30 minutes
    last_saved = time.time()
    with ThreadPoolExecutor(max_workers=8) as executor:
        # Map each future back to its author so progress can be reported even
        # for authors that yield zero quotes (the previous code indexed
        # quotes[0] and raised IndexError on empty results, silently skipping
        # the progress update via the broad except below).
        future_to_author = {
            executor.submit(scrape_quotes_for_author, author): author
            for author in all_authors
        }
        for future in as_completed(future_to_author):
            try:
                if abort_flag:
                    break
                quotes = future.result()
                all_quotes.extend(quotes)
                if abort_flag:
                    break
                if time.time() - last_saved > save_interval:  # save every n seconds
                    print("Saving quotes file...")
                    # Save quotes to JSON
                    with open('quotes.json', 'w', encoding='utf-8') as f:
                        json.dump(all_quotes, f, indent=2)
                    print("Saved quotes file...")
                    last_saved = time.time()
                current_count += 1
                person = future_to_author[future]['person']
                # max(..., 1) guards the percentage against an empty author list.
                print(f"Quotes extracted for author {person}. Total progress: {current_count / max(author_count, 1) * 100:.2f}% ({current_count}/{author_count})")
            except Exception as e:
                print(f"Error scraping quotes: {e}")
    with open('quotes.json', 'w', encoding='utf-8') as f:
        json.dump(all_quotes, f, indent=2)
    print(f"Total quotes scraped: {len(all_quotes)}")
# Run the scraper only when executed directly, not when imported.
if __name__ == "__main__":
    main()