Add script to parse user agents with historical data set.

(imported from commit e529c0b914ed3d3d06e9581a6239676f68c97b3f)
This commit is contained in:
Tim Abbott
2013-12-06 17:59:06 -05:00
parent 6958b63c58
commit 5c9def5be4
3 changed files with 1947 additions and 0 deletions

View File

@@ -0,0 +1,33 @@
#!/usr/bin/python
import re
from collections import defaultdict
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from zerver.lib.user_agent import parse_user_agent
# Tally how many requests each client name accounts for in the historical
# user-agent data set, reporting every line that cannot be parsed.
user_agents_parsed = defaultdict(int)
user_agents_path = os.path.join(os.path.dirname(__file__), "user_agents_unique")
parse_errors = 0

# Each data line is expected to look like: <count> "<user agent string>"
# Compiled once here because the pattern is the same for every line.
line_pattern = re.compile('^(?P<count>[0-9]+) "(?P<user_agent>.*)"$')

# `with open(...)` closes the handle even if parsing raises, and iterating the
# file object streams it line by line instead of loading it all into memory.
with open(user_agents_path) as user_agents_file:
    for line in user_agents_file:
        line = line.strip()
        match = line_pattern.match(line)
        if match is None:
            # Not in the expected format: echo the raw line for inspection.
            print(line)
            continue
        count = match.group("count")
        user_agent = match.group("user_agent")
        ret = parse_user_agent(user_agent)
        if ret is None:
            print("parse error %s" % (line,))
            parse_errors += 1
            continue
        user_agents_parsed[ret["name"]] += int(count)

# Single-argument print(...) calls behave the same under Python 2 and 3.
# Keys are sorted so the summary is stable between runs.
for key in sorted(user_agents_parsed):
    print("  %s %s" % (key, user_agents_parsed[key]))
print("%s parse errors!" % (parse_errors,))

1904
tools/user_agents_unique Normal file

File diff suppressed because it is too large Load Diff

10
zerver/lib/user_agent.py Normal file
View File

@@ -0,0 +1,10 @@
import re
# Warning: If you change this parsing, please test using
# tools/test_user_agent_parsing.py
# And extend tools/user_agents_unique with any new test cases
def parse_user_agent(user_agent):
    """Split a user agent string into its leading name and version.

    Returns the named groups of the regex match as a dict with the keys
    ``name`` and ``version``; ``version`` is None when no ``/<version>``
    part directly follows the name.  Returns None when the string does
    not match the pattern at all.
    """
    pattern = "^(?P<name>[^/ ]*[^0-9/(]*)(/(?P<version>[^/ ]*))?([ /].*)?$"
    result = re.match(pattern, user_agent)
    return result.groupdict() if result is not None else None