入門ソーシャルデータ 語彙的多様性
コード
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Measure the lexical diversity of tweets returned by a Twitter search.

Prints two figures for the collected tweets:

* vocabulary: unique words divided by total words
* the average number of whitespace-separated words per tweet
"""
import json  # kept from the original script; not used below


def fetch_tweets(query="LesPaul", pages=5, per_page=100):
    """Return the text of every tweet found on the first *pages* result pages.

    query    -- search string passed as ``q``
    pages    -- number of result pages to request (pages 1..pages)
    per_page -- results requested per page, passed as ``rpp``
    """
    # Imported here because only this function needs the third-party
    # ``twitter`` package; the pure helpers below work without it.
    import twitter

    # NOTE(review): the search.twitter.com endpoint may no longer be
    # served -- confirm before relying on this script.
    twitter_search = twitter.Twitter(domain="search.twitter.com")

    tweets = []
    for page in range(1, pages + 1):
        result = twitter_search.search(q=query, rpp=per_page, page=page)
        tweets.extend(r['text'] for r in result['results'])
    return tweets


def split_words(tweets):
    """Return all whitespace-separated words of *tweets* as one flat list."""
    words = []
    for text in tweets:
        words.extend(text.split())
    return words


def lexical_diversity(tweets):
    """Return unique-word count / total-word count, or 0.0 with no words."""
    words = split_words(tweets)
    if not words:
        # Avoid ZeroDivisionError when the search returned nothing.
        return 0.0
    return float(len(set(words))) / len(words)


def average_word_count(tweets):
    """Return the mean number of words per tweet, or 0.0 with no tweets."""
    if not tweets:
        # Avoid ZeroDivisionError when the search returned nothing.
        return 0.0
    return float(sum(len(text.split()) for text in tweets)) / len(tweets)


def main():
    """Fetch the tweets and print both statistics."""
    tweets = fetch_tweets()
    # Single-argument parenthesised print emits the same text on Python 2.
    print("vocabulary = " + str(lexical_diversity(tweets)))
    print("1 tweet average word count = " + str(average_word_count(tweets)))


if __name__ == "__main__":
    main()
実行結果
python word_count.py
vocabulary = 0.663345786633
1 tweet average word count = 4.83734939759