Initial commit
This commit is contained in:
168130
src/Tools/scraper/authors.json
Normal file
168130
src/Tools/scraper/authors.json
Normal file
File diff suppressed because it is too large
Load Diff
3896538
src/Tools/scraper/quotes.json
Normal file
3896538
src/Tools/scraper/quotes.json
Normal file
File diff suppressed because it is too large
Load Diff
217
src/Tools/scraper/scraper.py
Normal file
217
src/Tools/scraper/scraper.py
Normal file
@@ -0,0 +1,217 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
import time
|
||||
import re
|
||||
import os
|
||||
import signal
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
# Global abort flag, set True by the SIGINT handler so worker loops can stop early.
abort_flag = False

# Global session for reuse: one connection pool shared by all worker threads,
# with a browser-like User-Agent so the site serves normal pages.
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'})
|
||||
|
||||
def signal_handler(sig, frame):
    """SIGINT handler: announce the shutdown, then raise the global abort flag.

    Worker loops poll ``abort_flag`` and exit between units of work, so the
    in-progress JSON writes finish cleanly instead of being killed mid-file.
    """
    global abort_flag
    print("\nAborting... Waiting for current writes to complete...")
    abort_flag = True
|
||||
|
||||
# Register signal handler for SIGINT (Ctrl+C) so an interrupt requests a
# graceful shutdown (via abort_flag) instead of killing the process mid-write.
signal.signal(signal.SIGINT, signal_handler)
|
||||
|
||||
def get_authors(letter='a', page=1):
    """Fetch one page of the azquotes author index for *letter*.

    Args:
        letter: index letter ('a'–'z') to list authors for.
        page: 1-based page number within that letter.

    Returns:
        A list of ``{'person': name, 'href': link}`` dicts, or an empty
        list on HTTP failure, parse failure, or an empty page.
    """
    url = f"https://www.azquotes.com/quotes/authors/{letter}/{page}"
    try:
        # Explicit timeout: requests has no default, so a stalled server
        # would otherwise hang this worker thread indefinitely.
        response = session.get(url, timeout=30)
        if response.status_code != 200:
            return []

        soup = BeautifulSoup(response.content, 'html.parser')
        author_elements = soup.select('table tbody tr td:first-child a')

        return [
            {
                'person': element.get_text(strip=True),
                'href': element.get('href'),
            }
            for element in author_elements
        ]
    except Exception as e:
        print(f"Error fetching authors for letter {letter}, page {page}: {e}")
        return []
|
||||
|
||||
def has_next_page_author(soup):
    """Return True when the author-index page's pager contains a next link."""
    return soup.select_one('div.pager:nth-child(6) > li:last-child > a') is not None
|
||||
|
||||
def get_quotes_from_author(author_url):
    """Collect every quote from an author's page, following pagination.

    Args:
        author_url: absolute URL of the author's quote listing.

    Returns:
        A list of quote strings. On any request/parse error the partial
        result is discarded and an empty list is returned, so a half-scraped
        author is never recorded as complete. Stops early if ``abort_flag``
        is raised by the SIGINT handler.
    """
    global abort_flag
    all_quotes = []
    page = 1

    while True:
        if abort_flag:
            break
        # Page 1 is the bare author URL; later pages use the ?p= query.
        url = author_url if page == 1 else f"{author_url}?p={page}"

        try:
            # Explicit timeout: requests has no default, so a stalled server
            # would otherwise hang this worker thread indefinitely.
            response = session.get(url, timeout=30)
            if response.status_code != 200:
                break

            soup = BeautifulSoup(response.content, 'html.parser')
            quote_elements = soup.select('[id^="title_quote_link_"]')

            # Keep only non-empty quote texts from this page.
            quotes = [
                text
                for text in (el.get_text(strip=True) for el in quote_elements)
                if text
            ]

            if not quotes:  # No more quotes on this page
                break

            all_quotes.extend(quotes)

            # Stop when the pager offers no "next" link.
            if not soup.select_one('.next > a'):
                break

            page += 1

        except Exception as e:
            # Deliberate: drop partial pages so the author is retried whole
            # on the next run rather than saved half-scraped.
            all_quotes = []
            print(f"Error fetching quotes for {url}: {e}")
            break

    return all_quotes
|
||||
|
||||
def scrape_authors_for_letter(letter):
    """Collect every author listed under *letter*, walking pages until one is empty."""
    collected = []
    page = 1
    while True:
        batch = get_authors(letter, page)
        if not batch:
            return collected
        collected.extend(batch)
        page += 1
|
||||
|
||||
def scrape_quotes_for_author(author):
    """Fetch all quotes for one author record and tag each with the author name.

    *author* is a ``{'person': ..., 'href': ...}`` dict as produced by
    ``get_authors``. Returns a list of ``{'quote', 'author'}`` dicts, or
    an empty list when the href is missing or any error occurs.
    """
    try:
        href = author['href']
        if not href:
            return []

        # Relative hrefs are rooted at the site's domain.
        author_url = f"https://www.azquotes.com{href}" if href.startswith('/') else href

        return [
            {'quote': text, 'author': author['person']}
            for text in get_quotes_from_author(author_url)
        ]
    except Exception as e:
        print(f"Error scraping quotes for {author['person']}: {e}")
        return []
|
||||
|
||||
def main():
    """Scrape azquotes.com: first the author index, then each author's quotes.

    Authors are cached in authors.json and quotes in quotes.json. Existing
    quote data is loaded so already-scraped authors are skipped, and the
    quotes file is checkpointed periodically so a Ctrl+C loses little work.
    """
    print("Scraping authors...")
    letters = 'abcdefghijklmnopqrstuvwxyz'

    all_authors = []
    skip_authors = False
    if os.path.exists('authors.json'):
        choice = input("Authors file exists. Scrape authors again? (y/N): ")
        if choice.lower() != 'y':
            print("Skipping author scraping...")
            skip_authors = True
            with open('authors.json', 'r', encoding='utf-8') as f:
                all_authors = json.load(f)
            print(f"Loaded {len(all_authors)} authors from file.")

    if not skip_authors:
        # Scrape the 26 index letters concurrently.
        with ThreadPoolExecutor(max_workers=16) as executor:
            futures = [executor.submit(scrape_authors_for_letter, letter) for letter in letters]
            for future in as_completed(futures):
                try:
                    all_authors.extend(future.result())
                except Exception as e:
                    print(f"Error scraping authors: {e}")

        # Save authors to JSON
        with open('authors.json', 'w', encoding='utf-8') as f:
            json.dump(all_authors, f, indent=2)

    print(f"Total authors scraped: {len(all_authors)}")

    print("Scraping quotes...")

    all_quotes = []
    author_count = len(all_authors)

    # Resume support: drop authors whose quotes already exist in quotes.json.
    if os.path.exists('quotes.json'):
        with open('quotes.json', 'r', encoding='utf-8') as f:
            try:
                existing_quotes = json.load(f)
                all_quotes = existing_quotes
                print(f"Loaded {len(existing_quotes)} quotes")
            except Exception as e:
                print(f"Unable to load existing quotes due to exception {e}")
                exit()
        existing_authors = set(quote['author'] for quote in existing_quotes)
        all_authors = [author for author in all_authors if author['person'] not in existing_authors]

    current_count = author_count - len(all_authors)
    save_interval = 60 * 30  # checkpoint quotes.json every 30 minutes
    last_saved = time.time()
    with ThreadPoolExecutor(max_workers=8) as executor:
        # Map each future back to its author so progress can be reported even
        # when an author yields zero quotes — the previous quotes[0]['author']
        # lookup raised IndexError on empty results, which the broad except
        # below swallowed, silently skipping the progress line.
        future_to_author = {
            executor.submit(scrape_quotes_for_author, author): author
            for author in all_authors
        }
        for future in as_completed(future_to_author):
            try:
                if abort_flag:
                    break
                quotes = future.result()
                all_quotes.extend(quotes)
                if abort_flag:
                    break
                if (time.time() - last_saved > save_interval):  # save every n seconds
                    print("Saving quotes file...")
                    # Save quotes to JSON
                    with open('quotes.json', 'w', encoding='utf-8') as f:
                        json.dump(all_quotes, f, indent=2)
                    print("Saved quotes file...")
                    last_saved = time.time()
                current_count += 1
                author_name = future_to_author[future]['person']
                print(f"Quotes extracted for author {author_name}. Total progress: {current_count / author_count * 100:.2f}% ({current_count}/{author_count})")
            except Exception as e:
                print(f"Error scraping quotes: {e}")

    with open('quotes.json', 'w', encoding='utf-8') as f:
        json.dump(all_quotes, f, indent=2)
    print(f"Total quotes scraped: {len(all_quotes)}")
|
||||
|
||||
# Entry point: run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user