1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
| import nltk import math import string from nltk.corpus import stopwords from collections import Counter from gensim import corpora, models, matutils
# Sample corpus for the TF-IDF demo below: three short English passages,
# each describing a different sport. Every passage is treated as one document.

# Document 1: football.
text1 =""" Football is a family of team sports that involve, to varying degrees, kicking a ball to score a goal. Unqualified, the word football is understood to refer to whichever form of football is the most popular in the regional context in which the word appears. Sports commonly called football in certain places include association football (known as soccer in some countries); gridiron football (specifically American football or Canadian football); Australian rules football; rugby football (either rugby league or rugby union); and Gaelic football. These different variations of football are known as football codes. """
# Document 2: basketball.
text2 = """ Basketball is a team sport in which two teams of five players, opposing one another on a rectangular court, compete with the primary objective of shooting a basketball (approximately 9.4 inches (24 cm) in diameter) through the defender's hoop (a basket 18 inches (46 cm) in diameter mounted 10 feet (3.048 m) high to a backboard at each end of the court) while preventing the opposing team from shooting through their own hoop. A field goal is worth two points, unless made from behind the three-point line, when it is worth three. After a foul, timed play stops and the player fouled or designated to shoot a technical foul is given one or more one-point free throws. The team with the most points at the end of the game wins, but if regulation play expires with the score tied, an additional period of play (overtime) is mandated. """
# Document 3: volleyball.
text3 = """ Volleyball, game played by two teams, usually of six players on a side, in which the players use their hands to bat a ball back and forth over a high net, trying to make the ball touch the court within the opponents’ playing area before it can be returned. To prevent this a player on the opposing team bats the ball up and toward a teammate before it touches the court surface—that teammate may then volley it back across the net or bat it to a third teammate who volleys it across the net. A team is allowed only three touches of the ball before it must be returned over the net. """
def get_tokens(text):
    """Split *text* into word tokens, discarding punctuation-only tokens.

    The text is first split into sentences with ``nltk.sent_tokenize`` and
    each sentence is then split into words with ``nltk.word_tokenize``.

    Args:
        text: The raw document as a single string.

    Returns:
        A list of token strings in document order. Case is preserved.
    """
    # Turn line breaks into spaces (not into nothing) so the last word of one
    # line is never glued to the first word of the next line.
    text = text.replace('\n', ' ')
    punctuation = set(string.punctuation)

    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            # Skip tokens made up solely of ASCII punctuation. Checking every
            # character (instead of a substring test against the punctuation
            # string) also drops multi-character tokens such as "``", "''",
            # "..." or "--".
            if all(ch in punctuation for ch in word):
                continue
            tokens.append(word)
    return tokens
def make_count(text):
    """Count how often each non-stop-word token occurs in *text*.

    Args:
        text: The raw document as a single string.

    Returns:
        A ``collections.Counter`` mapping each remaining token to its number
        of occurrences in the document.
    """
    # Build the stop-word set once per call instead of re-loading the list
    # for every token; set membership is also O(1) per lookup.
    # Matching is exact, so differently-cased tokens are not filtered.
    stop_words = set(stopwords.words('english'))
    return Counter(
        token for token in get_tokens(text) if token not in stop_words
    )
def tf(word, count):
    """Return the term frequency of *word* within one document.

    Args:
        word: The token to look up.
        count: Mapping of token -> occurrence count for the document
            (the script passes a ``collections.Counter``).

    Returns:
        Occurrences of *word* divided by the total number of counted tokens,
        or ``0.0`` when the document has no tokens at all.
    """
    total = sum(count.values())
    # An empty document has no meaningful frequency; avoid dividing by zero.
    if not total:
        return 0.0
    return count[word] / total
def n_containing(word, count_list):
    """Return the number of documents in *count_list* that contain *word*.

    Args:
        word: The token to look for.
        count_list: Iterable of per-document containers supporting ``in``.

    Returns:
        How many of the documents contain *word*.
    """
    hits = 0
    for doc in count_list:
        if word in doc:
            hits += 1
    return hits
def idf(word, count_list):
    """Return the inverse document frequency of *word* (base-2 logarithm).

    Args:
        word: The token to score.
        count_list: Sequence of per-document token containers (the corpus).

    Returns:
        ``log2(number of documents / number of documents containing word)``.
        Returns ``0.0`` when no document contains *word* (which includes an
        empty corpus) instead of raising ``ZeroDivisionError``.
    """
    doc_freq = n_containing(word, count_list)
    # A word that appears nowhere has no defined IDF; report it as neutral.
    if doc_freq == 0:
        return 0.0
    return math.log2(len(count_list) / doc_freq)
def tfidf(word, count, count_list):
    """Return the TF-IDF weight of *word* for one document of a corpus.

    Args:
        word: The token to score.
        count: Token counts of the document being scored.
        count_list: Token counts of every document in the corpus.

    Returns:
        The product of ``tf(word, count)`` and ``idf(word, count_list)``.
    """
    term_frequency = tf(word, count)
    inverse_doc_frequency = idf(word, count_list)
    return term_frequency * inverse_doc_frequency
import numpy as np
def unitvec(sorted_words):
    """Scale the scores of ``(word, score)`` pairs to unit Euclidean length.

    Args:
        sorted_words: Iterable of items whose index 0 is the word and whose
            index 1 is its numeric score.

    Returns:
        A new list of ``(word, score / norm)`` tuples in the input order,
        where ``norm`` is the L2 norm of all scores. When the norm is zero
        (empty input or all scores zero) the scores are returned unscaled
        instead of raising ``ZeroDivisionError``.
    """
    # Materialise once so one-shot iterables can be traversed twice.
    pairs = list(sorted_words)
    norm = math.sqrt(sum(item[1] * item[1] for item in pairs))
    if norm == 0:
        # Nothing to normalise: every score is already zero.
        return [(item[0], item[1]) for item in pairs]
    return [(item[0], item[1] / norm) for item in pairs]
# --- Hand-rolled TF-IDF -----------------------------------------------------
# Build one token-frequency Counter per sample document.
count1 = make_count(text1)
count2 = make_count(text2)
count3 = make_count(text3)
countlist = [count1, count2, count3]

print("Training by original algorithm......\n")
for doc_number, doc_count in enumerate(countlist, start=1):
    print("Top words in document %d"%(doc_number))
    # Score every distinct token of this document against the whole corpus.
    scores = {}
    for token in doc_count:
        scores[token] = tfidf(token, doc_count, countlist)
    # Rank by descending score, then L2-normalise the scores.
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    ranked = unitvec(ranked)
    # Report the three best-scoring tokens.
    for word, score in ranked[:3]:
        print(" Word: %s, TF-IDF: %s"%(word, round(score, 5)))
def get_words(text):
    """Return the tokens of *text* with English stop words removed.

    Args:
        text: The raw document as a single string.

    Returns:
        A list of the remaining tokens in document order (duplicates kept).
    """
    # Build the stop-word set once per call instead of re-loading the list
    # for every token; set membership is also O(1) per lookup.
    # Matching is exact, so differently-cased tokens are not filtered.
    stop_words = set(stopwords.words('english'))
    return [token for token in get_tokens(text) if token not in stop_words]
# --- gensim TF-IDF ----------------------------------------------------------
# Re-tokenise the sample documents as plain word lists (stop words removed).
# The count1/count2/count3/countlist names are rebound to these lists.
countlist = [get_words(document) for document in (text1, text2, text3)]
count1, count2, count3 = countlist

# Vocabulary built from all documents, plus the inverse of its token2id
# mapping so ids can be turned back into words when printing.
dictionary = corpora.Dictionary(countlist)
new_dict = {
    token_id: token for token, token_id in dictionary.token2id.items()
}

# Bag-of-words form of each document, then the TF-IDF model fitted on it and
# applied to the same corpus.
corpus2 = [dictionary.doc2bow(words) for words in countlist]
tfidf2 = models.TfidfModel(corpus2)
corpus_tfidf = tfidf2[corpus2]
print("\nTraining by gensim Tfidf Model.......\n")
for doc_number, doc in enumerate(corpus_tfidf, start=1):
    print("Top words in document %d"%(doc_number))
    # Each entry of doc is indexed as (token id, weight); rank by weight.
    ranked = sorted(doc, key=lambda pair: pair[1], reverse=True)
    for num, score in ranked[:3]:
        print(" Word: %s, TF-IDF: %s"%(new_dict[num], round(score, 5)))

# The no-op string below records the output of a previous run of this script
# (its heading means "Output:"). It is kept verbatim for reference.
""" 输出结果:
Training by original algorithm......
Top words in document 1 Word: football, TF-IDF: 0.84766 Word: rugby, TF-IDF: 0.21192 Word: word, TF-IDF: 0.14128 Top words in document 2 Word: play, TF-IDF: 0.29872 Word: inches, TF-IDF: 0.19915 Word: points, TF-IDF: 0.19915 Top words in document 3 Word: net, TF-IDF: 0.45775 Word: teammate, TF-IDF: 0.34331 Word: bat, TF-IDF: 0.22888
Training by gensim Tfidf Model.......
Top words in document 1 Word: football, TF-IDF: 0.84766 Word: rugby, TF-IDF: 0.21192 Word: known, TF-IDF: 0.14128 Top words in document 2 Word: play, TF-IDF: 0.29872 Word: cm, TF-IDF: 0.19915 Word: diameter, TF-IDF: 0.19915 Top words in document 3 Word: net, TF-IDF: 0.45775 Word: teammate, TF-IDF: 0.34331 Word: across, TF-IDF: 0.22888 """
|