輸入文檔的部分以 Python list 表示,每個元素代表一個文檔。這裡先以簡單的英文句子示範,文檔如下:
1 2 3 4 5 6 7
# Sample corpus: each element is one document, written as a plain English sentence.
documents = [
    "I want to adopt the dog",
    "a apple a day keeps doctor away",
    "I have a pen I have an apple",
    "who is your daddy",
    "daddy daddy daddy daddy daddy",
]
def _document_to_freq_dict(self, document):
    """Count how many times each token occurs in ``document``.

    Args:
        document: An iterable of tokens (presumably the list of words
            produced by the tokenizer -- confirm against the caller).

    Returns:
        A dict mapping each distinct token to its number of occurrences,
        with keys in first-occurrence order.
    """
    freq_dict = {}
    for word in document:
        # One pass over the tokens; calling document.count() for every
        # token would rescan the whole document each time (quadratic).
        freq_dict[word] = freq_dict.get(word, 0) + 1
    return freq_dict


def _count_doc_freq_by_word(self, word):
    """Return how many documents in ``self.documents`` contain ``word``.

    Each document counts at most once, however often the word repeats in it.
    """
    return sum(1 for document in self.documents if word in document)


def compute_tf_idf(self, word, document):
    """Compute the TF-IDF weight of ``word`` within ``document``.

    The term frequency is the share of tokens in ``document`` equal to
    ``word``. The inverse document frequency is
    ``log10(N / (1 + df))``, where ``N`` is ``len(self.documents)`` and
    ``df`` is the number of documents containing ``word``.

    Args:
        word: The term to score.
        document: The tokenized document the term is scored against.

    Returns:
        ``tf * idf`` as a float. Returns ``0.0`` for an empty document.
        The value is negative when ``word`` occurs in every document,
        because the ``1 +`` in the denominator pushes the ratio below 1.
    """
    freq_dict = self._document_to_freq_dict(document)
    total_terms = sum(freq_dict.values())
    if total_terms == 0:
        # An empty document contains no terms; avoid dividing by zero.
        return 0.0

    tf = freq_dict.get(word, 0) / total_terms
    # The "1 +" keeps the denominator non-zero for words absent from the corpus.
    idf = math.log10(len(self.documents) / (1 + self._count_doc_freq_by_word(word)))
    return tf * idf
def tokenize(documents):
    """Split every document string into a list of word tokens.

    Tokens are separated by any run of whitespace, so repeated, leading or
    trailing spaces (and tabs/newlines) never produce empty-string tokens.

    Args:
        documents: An iterable of document strings.

    Returns:
        A list with one token list per input document, in input order.
        An empty or whitespace-only document yields an empty token list.
    """
    # str.split() with no separator collapses whitespace runs, whereas
    # split(" ") would emit "" for every extra space and skew term counts.
    return [document.split() for document in documents]
defmain(): documents = [ "I want to adopt the dog", "a apple a day keeps doctor away", "I have a pen I have an apple", "who is your daddy", "daddy daddy daddy daddy daddy" ] # 分詞前處理 documents = tokenize(documents)