import json
import re


def load_gmail():
    """Load ham and spam email records from JSON files in the working dir.

    Returns:
        list: ham records followed by spam records (order preserved).
    """
    with open('ham_emails.json', 'r') as file:
        ham = json.load(file)
    with open('spam_emails.json', 'r') as file:
        spam = json.load(file)
    return ham + spam


def preprocess(text):
    """Lowercase *text*, replace punctuation with spaces, and tokenize.

    Args:
        text: raw email body.

    Returns:
        list[str]: whitespace-separated lowercase tokens.
    """
    text = text.lower()
    # BUG FIX: the original pattern was r"[^/w/s]" — literal '/', 'w', 's',
    # which wiped out almost every letter. The intended escapes are \w and \s.
    text = re.sub(r"[^\w\s]", " ", text)
    return text.split()


def train_bayes(train_data, alpha=1.0):
    """Counting pass for a multinomial naive Bayes spam classifier.

    Args:
        train_data: iterable of records shaped {"label": str, "text": str}.
        alpha: Laplace smoothing constant; stored in the model for the
            (not shown here) prediction step.

    Returns:
        dict: per-label document counts ("class_count"), per-label word
        frequencies ("word_count"), per-label total word occurrences
        ("total_words"), the global vocabulary set ("vocab"), "alpha",
        and "total_docs" (number of training records).
    """
    class_count = {}   # documents per label, e.g. {"ham": 40}
    word_count = {}    # per-label word frequencies, e.g. {"ham": {"offer": 5}}
    total_words = {}   # total word occurrences per label, e.g. {"ham": 200}
    for rec in train_data:
        label = rec["label"]
        class_count[label] = class_count.get(label, 0) + 1
        word_count.setdefault(label, {})
        total_words.setdefault(label, 0)
        for word in preprocess(rec["text"]):
            total_words[label] += 1
            word_count[label][word] = word_count[label].get(word, 0) + 1
    # Global vocabulary: every distinct word seen under any label.
    vocab = set()
    for wc in word_count.values():
        vocab.update(wc.keys())
    return {
        "class_count": class_count,
        "word_count": word_count,
        "total_words": total_words,
        # BUG FIX: the original returned total_words under the "vocab" key,
        # silently discarding the vocabulary set built above.
        "vocab": vocab,
        "alpha": alpha,
        "total_docs": len(train_data),
    }