import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import sqlite3
import threading
import queue
import tkinter as tk
from tkinter import ttk, messagebox
# Database setup
DB_NAME = "gui_search_engine.db"
conn = sqlite3.connect(DB_NAME, check_same_thread=False)
cursor = conn.cursor()
# Create the pages table (url is UNIQUE so re-crawled pages are deduplicated)
cursor.execute("""
CREATE TABLE IF NOT EXISTS pages (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    url TEXT UNIQUE,
    title TEXT,
    description TEXT,
    content TEXT
)
""")
conn.commit()
# Global state
visited = set()
visited_lock = threading.Lock()
db_lock = threading.Lock()  # check_same_thread=False only disables sqlite3's check; concurrent writes still need a lock
task_queue = queue.Queue()
MAX_THREADS = 5
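# Design: worker threads pull URLs from the shared task_queue, and the visited
# set (guarded by visited_lock) keeps the same page from being fetched twice.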
# Save a page to the database
def save_page_to_db(url, title, description, content):
    try:
        with db_lock:
            cursor.execute(
                "INSERT INTO pages (url, title, description, content) VALUES (?, ?, ?, ?)",
                (url, title, description, content),
            )
            conn.commit()
    except sqlite3.IntegrityError:
        pass  # Ignore duplicate URLs
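# Alternative: "INSERT OR IGNORE INTO pages ..." would let SQLite skip
# duplicate URLs itself instead of relying on the IntegrityError.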
# Normalize a URL: resolve relative links against the base and drop fragments
def normalize_url(base, link):
    return urljoin(base, link).split('#')[0]
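# e.g. normalize_url("https://example.com/a/", "../b#top") -> "https://example.com/b"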
# Crawler
def crawl(url, domain, status_label):
    with visited_lock:
        if url in visited:
            return
        visited.add(url)
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        # Collect metadata (soup.title.string can be None, so fall back explicitly)
        title = soup.title.get_text(strip=True) if soup.title else "No Title"
        description_tag = soup.find("meta", attrs={"name": "description"})
        description = description_tag.get("content", "No Description") if description_tag else "No Description"
        content = soup.get_text()
        # Save to the database
        save_page_to_db(url, title, description, content)
        # Update the status label on the Tk main thread (Tkinter widgets are not thread-safe)
        status_label.after(0, lambda u=url: status_label.config(text=f"Crawling: {u}"))
        # Queue follow-up links that stay on the same domain
        for link in soup.find_all('a', href=True):
            full_url = normalize_url(url, link['href'])
            if urlparse(full_url).netloc == domain:
                task_queue.put(full_url)
    except Exception as e:
        print(f"Error crawling {url}: {e}")
# Parallel crawling
def start_crawling(start_url, domain, status_label):
    with visited_lock:
        visited.clear()
    task_queue.put(start_url)

    def worker():
        while True:
            try:
                # Checking task_queue.empty() races with other workers (the queue
                # can be momentarily empty while pages are still being parsed), so
                # block with a timeout and exit once no new URLs arrive.
                url = task_queue.get(timeout=3)
            except queue.Empty:
                break
            crawl(url, domain, status_label)

    threads = []
    for _ in range(MAX_THREADS):
        thread = threading.Thread(target=worker, daemon=True)
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()
    # Report completion once, after every worker has finished
    status_label.after(0, lambda: status_label.config(text="Crawling Complete"))
# Search function
def search(query, results_box):
    cursor.execute("SELECT url, title, description FROM pages WHERE content LIKE ?", (f"%{query}%",))
    results = cursor.fetchall()
    results_box.delete(*results_box.get_children())  # Clear previous results
    if not results:
        messagebox.showinfo("Search Results", "No matching results were found.")
    else:
        for url, title, description in results:
            results_box.insert("", "end", values=(title, url, description[:100]))
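# Caveat: "%" and "_" in the query act as LIKE wildcards; to match them
# literally, escape them in the query string and add an ESCAPE clause,
# e.g. "... WHERE content LIKE ? ESCAPE '\'".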
# GUI layout
def create_gui():
    root = tk.Tk()
    root.title("Crawler-Based Search Engine")

    # Frames
    frame_crawl = ttk.Frame(root, padding=10)
    frame_crawl.grid(row=0, column=0, sticky="ew")
    frame_search = ttk.Frame(root, padding=10)
    frame_search.grid(row=1, column=0, sticky="nsew")

    # Crawling section
    ttk.Label(frame_crawl, text="Start URL:").grid(row=0, column=0, padx=5, pady=5, sticky="w")
    url_entry = ttk.Entry(frame_crawl, width=50)
    url_entry.grid(row=0, column=1, padx=5, pady=5, sticky="w")
    status_label = ttk.Label(frame_crawl, text="Status: Ready", foreground="blue")
    status_label.grid(row=1, column=0, columnspan=2, padx=5, pady=5, sticky="w")
    def on_crawl():
        start_url = url_entry.get().strip()
        if not start_url:
            messagebox.showerror("Error", "Please enter a start URL.")
            return
        # Add a scheme if one is missing
        if not start_url.startswith(("http://", "https://")):
            start_url = "https://" + start_url
        # Extract the domain to restrict the crawl
        domain = urlparse(start_url).netloc
        status_label.config(text="Crawling in Progress...")
        # daemon=True so a running crawl does not keep the process alive after the window closes
        threading.Thread(target=start_crawling, args=(start_url, domain, status_label), daemon=True).start()

    ttk.Button(frame_crawl, text="Start Crawling", command=on_crawl).grid(row=0, column=2, padx=5, pady=5)
    # Search section
    ttk.Label(frame_search, text="Search Query:").grid(row=0, column=0, padx=5, pady=5, sticky="w")
    query_entry = ttk.Entry(frame_search, width=30)
    query_entry.grid(row=0, column=1, padx=5, pady=5, sticky="w")
    results_box = ttk.Treeview(frame_search, columns=("Title", "URL", "Description"), show="headings")
    results_box.heading("Title", text="Title")
    results_box.heading("URL", text="URL")
    results_box.heading("Description", text="Description")
    results_box.grid(row=1, column=0, columnspan=3, padx=5, pady=5, sticky="nsew")
    def on_search():
        query = query_entry.get().strip()
        if not query:
            messagebox.showerror("Error", "Please enter a search query.")
            return
        search(query, results_box)

    ttk.Button(frame_search, text="Search", command=on_search).grid(row=0, column=2, padx=5, pady=5)

    # Let the results area grow with the window
    root.columnconfigure(0, weight=1)
    frame_search.rowconfigure(1, weight=1)
    root.mainloop()
# Launch the GUI
if __name__ == "__main__":
    create_gui()
    # Close the database connection once the window is closed
    conn.close()
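# Note: conn.close() runs only after the Tk window closes; if daemon worker
# threads are still crawling at that point, they may hit a closed connection.
# A production version would signal workers to stop before closing.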