# %% Cell 1 - imports, configuration and loading of the training data
import json
import random
import math

# Seed used by main() so that the train/test split (and therefore the
# reported accuracy) is reproducible across "Restart & Run All".
RANDOM_SEED = 42


# ------------------------------
# 1. Loading the training data
# ------------------------------
def load_email_data(spam_path, ham_path):
    """Load the spam and ham JSON files and return their contents concatenated.

    Args:
        spam_path: Path of the UTF-8 encoded JSON file with spam examples.
        ham_path: Path of the UTF-8 encoded JSON file with ham examples.

    Returns:
        ``spam_data + ham_data`` (spam records first). The rest of the
        notebook reads ``rec["label"]`` and ``rec["text"]`` from every record.
    """
    with open(spam_path, encoding="utf-8") as spam_file, \
            open(ham_path, encoding="utf-8") as ham_file:
        spam_data = json.load(spam_file)
        ham_data = json.load(ham_file)
    return spam_data + ham_data


def train_test_split(data, test_ratio=0.2, seed=None):
    """Randomly split ``data`` into a training part and a test part.

    The input sequence is NOT modified; a shuffled copy is split instead.

    Args:
        data: Sequence of records.
        test_ratio: Fraction of the records that goes to the test part.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the split is reproducible; when ``None`` the module-level
            ``random`` generator is used, as before.

    Returns:
        ``(train, test)`` - two lists that together contain every record once.
    """
    shuffled = list(data)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)
    cut = int(len(shuffled) * (1 - test_ratio))
    return shuffled[:cut], shuffled[cut:]


# %% Cell 2 - training
# ------------------------------
# 2. Training the naive Bayes classifier
# ------------------------------

# Every character listed here is turned into a space before tokenising.
_PUNCTUATION_TO_SPACE = str.maketrans({ch: " " for ch in "–-,.!?"})


def preprocess(text):
    """Lower-case ``text``, blank out basic punctuation and split on whitespace.

    Returns:
        List of word tokens (possibly empty).
    """
    return text.lower().translate(_PUNCTUATION_TO_SPACE).split()


def train_naive_bayes(train_data, alpha=1.0):
    """Collect the counts needed by a multinomial naive Bayes classifier.

    Args:
        train_data: Iterable of records with ``"label"`` and ``"text"`` keys.
        alpha: Additive (Laplace) smoothing constant stored in the model.

    Returns:
        A dict with the keys ``class_counts`` (documents per label),
        ``word_counts`` (per-label word frequencies), ``total_words`` (tokens
        per label), ``vocab`` (set of all seen words), ``alpha`` and
        ``total_docs``.
    """
    class_counts = {}
    word_counts = {}
    total_words = {}

    for rec in train_data:
        label = rec["label"]
        class_counts[label] = class_counts.get(label, 0) + 1
        label_words = word_counts.setdefault(label, {})
        total_words.setdefault(label, 0)

        words = preprocess(rec["text"])
        for word in words:
            label_words[word] = label_words.get(word, 0) + 1
        total_words[label] += len(words)

    # Iterating a dict yields its keys, so this is the union of all words.
    vocab = set().union(*word_counts.values())

    return {
        "class_counts": class_counts,
        "word_counts": word_counts,
        "total_words": total_words,
        "vocab": vocab,
        "alpha": alpha,
        "total_docs": len(train_data)
    }


# %% Cell 3 - classification and evaluation
# ------------------------------
# 3. Classifying messages
# ------------------------------
def log_prob(model, words, class_name):
    """Return the unnormalised log-posterior of ``class_name`` for ``words``.

    Computes ``log P(class) + sum(log P(word | class))`` with additive
    smoothing. If a word has zero probability (possible only when
    ``alpha == 0`` or nothing was learned) ``-inf`` is returned instead of
    raising from ``math.log`` or a division by zero.
    """
    logp = math.log(model["class_counts"][class_name] / model["total_docs"])
    alpha = model["alpha"]
    counts = model["word_counts"][class_name]
    # The denominator does not depend on the word, so compute it once.
    denominator = model["total_words"][class_name] + alpha * len(model["vocab"])
    for word in words:
        numerator = counts.get(word, 0) + alpha
        if numerator <= 0 or denominator <= 0:
            return -float("inf")
        logp += math.log(numerator / denominator)
    return logp


def predict(model, text):
    """Return the label with the highest log-posterior for ``text``.

    Ties are resolved in favour of the label seen first during training.
    Returns ``None`` only when the model contains no classes at all.
    """
    words = preprocess(text)
    best_class, best_log = None, None
    for candidate in model["class_counts"]:
        score = log_prob(model, words, candidate)
        # `best_log is None` lets the first class win even if its score is -inf.
        if best_log is None or score > best_log:
            best_class, best_log = candidate, score
    return best_class


def evaluate_model(model, test_data):
    """Print and return the accuracy of ``model`` on ``test_data``.

    Returns:
        Fraction of correctly classified records in ``[0, 1]``;
        ``0.0`` (after printing a notice) when ``test_data`` is empty.
    """
    if not test_data:
        print("Zbiór testowy jest pusty – nie można obliczyć skuteczności.")
        return 0.0

    correct = sum(
        1 for rec in test_data if predict(model, rec["text"]) == rec["label"]
    )
    accuracy = correct / len(test_data)
    print(f"Skuteczność na zbiorze testowym: {accuracy * 100:.2f}%")
    return accuracy


# %% Cell 4 - one-time Gmail authorisation
# Code meant to be run locally on your own computer.
import os
import pickle

SCOPES = ['https://www.googleapis.com/auth/gmail.modify']


def authorize_and_save_token():
    """Run the OAuth flow once and store the credentials in ``token.pkl``.

    Does nothing (apart from printing a notice) when ``token.pkl`` already
    exists. After a successful authorisation the user's Gmail labels are
    listed as a quick sanity check.
    """
    if os.path.exists('token.pkl'):
        print("Plik token.pkl już istnieje.")
        return

    # Imported lazily so the classifier cells above can be used without the
    # Google client libraries installed.
    from google_auth_oauthlib.flow import InstalledAppFlow
    from googleapiclient.discovery import build

    flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
    creds = flow.run_local_server(port=0)

    with open('token.pkl', 'wb') as token_file:
        pickle.dump(creds, token_file)
    print("Autoryzacja zakończona i token zapisany jako token.pkl.")

    service = build('gmail', 'v1', credentials=creds)
    results = service.users().labels().list(userId='me').execute()
    print("Twoje etykiety Gmail:")
    for label in results.get('labels', []):
        print("•", label['name'])


if __name__ == '__main__':
    authorize_and_save_token()


# %% Cell 5 - Gmail access and re-labelling
import base64


def get_gmail_service():
    """Build a Gmail API client from the credentials stored in ``token.pkl``.

    Expired credentials that carry a refresh token are refreshed first.
    """
    # Imported lazily so the classifier cells above can be used without the
    # Google client libraries installed.
    from google.auth.transport.requests import Request
    from googleapiclient.discovery import build

    # NOTE(security): pickle.load executes arbitrary code from the file.
    # Only load a token.pkl that you created yourself.
    with open("token.pkl", "rb") as token:
        creds = pickle.load(token)

    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())

    service = build("gmail", "v1", credentials=creds)
    return service


def fetch_unread_emails_from_label(model, label_name='Test'):
    """Classify unread messages under ``label_name`` and re-label them.

    Every unread message carrying ``label_name`` (first page, at most 100) is
    classified with ``model``; it then receives the ``spam2`` label when the
    prediction is ``'spam'`` and the ``ham`` label otherwise, and
    ``label_name`` is removed from it.

    Returns:
        A list with one ``{"id", "subject", "prediction"}`` dict per message
        that was re-labelled. The list is empty when a required label does
        not exist or when there is nothing to process.
    """
    service = get_gmail_service()

    label_test = get_label_id(service, label_name)
    label_ham = get_label_id(service, 'ham')
    label_spam = get_label_id(service, 'spam2')

    if not label_test:
        print(f"Etykieta '{label_name}' nie została znaleziona.")
        return []

    # Without both target labels the modify() call would be sent [None].
    missing_targets = [
        name for name, label_id in (('ham', label_ham), ('spam2', label_spam))
        if not label_id
    ]
    if missing_targets:
        for name in missing_targets:
            print(f"Etykieta '{name}' nie została znaleziona.")
        return []

    response = service.users().messages().list(
        userId='me',
        labelIds=[label_test, 'UNREAD'],
        maxResults=100
    ).execute()

    messages = response.get('messages', [])
    email_list = []

    for msg in messages:
        msg_id = msg['id']
        message = service.users().messages().get(userId='me', id=msg_id, format='full').execute()
        payload = message.get('payload', {})
        headers = payload.get('headers', [])

        subject = next((h['value'] for h in headers if h['name'] == 'Subject'), '')
        body = get_message_body(payload)
        full_text = f"{subject} {body.strip()}"

        prediction = predict(model, full_text)
        if prediction is None:
            # Happens only for a model trained on no data; leave the message alone.
            print(f"[POMINIĘTO] Wiadomość '{subject[:40]}...' – model nie zwrócił klasy")
            continue

        add_labels = [label_spam if prediction == 'spam' else label_ham]
        remove_labels = [label_test]

        service.users().messages().modify(
            userId='me',
            id=msg_id,
            body={
                'addLabelIds': add_labels,
                'removeLabelIds': remove_labels
            }
        ).execute()

        print(f"[ZMIANA] Wiadomość '{subject[:40]}...' → {prediction.upper()} (etykieta zmieniona)")
        email_list.append({'id': msg_id, 'subject': subject, 'prediction': prediction})

    return email_list


def get_label_id(service, label_name):
    """Return the id of the Gmail label named ``label_name`` (case-insensitive).

    Returns ``None`` when the account has no such label.
    """
    labels = service.users().labels().list(userId='me').execute().get('labels', [])
    wanted = label_name.lower()
    for label in labels:
        if label['name'].lower() == wanted:
            return label['id']
    return None


def _decode_body_data(data):
    """Decode a base64url string (padding optional) into text."""
    padded = data + "=" * (-len(data) % 4)
    return base64.urlsafe_b64decode(padded).decode('utf-8', errors='ignore')


def _first_plain_text(parts):
    """Return the first decodable ``text/plain`` body under ``parts``, else None.

    Direct children are checked before nested containers so that messages
    with a top-level ``text/plain`` part resolve exactly as before.
    """
    for part in parts:
        if part.get('mimeType') == 'text/plain':
            data = (part.get('body') or {}).get('data')
            if data:
                return _decode_body_data(data)
    for part in parts:
        nested = part.get('parts')
        if nested:
            text = _first_plain_text(nested)
            if text is not None:
                return text
    return None


def get_message_body(payload):
    """Extract the plain-text body from a message payload dict.

    For multipart payloads the first ``text/plain`` part with data is used,
    including parts nested inside ``multipart/*`` containers. For a payload
    without parts, its own body data is decoded.

    Returns:
        The decoded text, or ``"(brak treści)"`` when no body was found.
    """
    parts = payload.get('parts')
    if parts:
        text = _first_plain_text(parts)
        if text is not None:
            return text
    else:
        body_data = (payload.get('body') or {}).get('data')
        if body_data:
            return _decode_body_data(body_data)
    return "(brak treści)"


# %% Cell 6 - entry point
# ------------------------------
# 4. Main function
# ------------------------------
from pprint import pprint


def main():
    """Train on the local JSON data, report accuracy, then re-label Gmail messages."""
    data = load_email_data("spam_emails.json", "ham_emails.json")

    train, test = train_test_split(data, seed=RANDOM_SEED)

    model = train_naive_bayes(train)

    evaluate_model(model, test)

    fetch_unread_emails_from_label(model, label_name='Test')


# ------------------------------
# 5. Run
# ------------------------------
if __name__ == "__main__":
    main()