#!/usr/bin/python
#
# RSS Subscriber Analyzer
# Written by Evan Jones in March, 2005
# http://evanjones.ca/
#
# You can do whatever you want with this program. If it breaks, you can keep
# both pieces.
#

import re
import sys

# Matches access-log lines of the form
#   ip - user [date:time zone] "request" status bytes "referer" "user agent"
# Groups, in order: ip, ipv4, ipv6, user, date, time, timeZone, request,
# statusCode, bytes, referer, userAgent.
# The bytes field also accepts "-", which Apache's %b writes when a response
# carries no body (e.g. 304 Not Modified on a conditional feed poll). The user
# field accepts any run of non-blank characters so that names containing
# ".", "@" or "-" still parse.
recordExpression = re.compile(
	r'((\d+\.\d+\.\d+\.\d+)|([0-9a-f:]+)) - (\S+) '
	r'\[(\d+/\w+/\d+):(\d+:\d+:\d+) ([-+]\d+)\] '
	r'"([^"]+)" (\d+) (\d+|-) "([^"]*)" "([^"]*)"'
)

def parseLine( line ):
	"""Split one access-log line into its fields.

	line: a single log record; anything after the user agent's closing
	quote (such as the trailing newline) is ignored.

	Returns a dict with the keys 'ip', 'date', 'time', 'timeZone',
	'request', 'statusCode', 'bytes', 'referer' and 'userAgent'. Every
	value is the string captured from the line; 'bytes' is '-' when the
	log recorded no byte count.

	Raises ValueError, with the offending line in the message, when the
	line does not match recordExpression.
	"""
	match = recordExpression.match( line )
	if not match:
		# The line goes into the exception rather than onto stdout, so it
		# cannot end up mixed into the report the script prints.
		raise ValueError( "unparseable log line: %r" % (line,) )

	# The ipv4/ipv6 sub-groups only say which alternative matched; callers
	# get the combined address. Authenticated user is parsed but not returned.
	(ip, _ipv4, _ipv6, _user, date, timeOfDay, timeZone,
		request, statusCode, numBytes, referer, userAgent) = match.groups()

	return {
		'ip': ip,
		'date': date,
		'time': timeOfDay,
		'timeZone': timeZone,
		'request': request,
		'statusCode': statusCode,
		'bytes': numBytes,
		'referer': referer,
		'userAgent': userAgent,
	}

# Every parsed record is appended here for the whole run.
# NOTE(review): nothing in the visible script reads this list back, and it
# grows with the size of the log -- drop it if no other code depends on it.
hits = []

# Date string of the previous record; None until the first record is read.
lastDay = None
# (ip, userAgent) -> 1 for each distinct direct reader seen on the current day.
users = {}
# Service string -> ip of the first hit from that service on the current day.
services = {}
# Sum of the subscriber counts reported by services on the current day.
extras = 0

# User agents of the form "<service>; <N> subscribers"
extraExpression = re.compile( r'(.*); (\d+) subscribers' )

# Prints one "<date>\t<count>" row per day to stdout.
# Assumes the records on stdin are grouped by day -- a date that reappears
# later would be reported as a second row.
for line in sys.stdin:
	parsed = parseLine( line )
	hits.append( parsed )

	# The date changed, so the previous day is complete: report it and reset.
	if lastDay is not None and parsed['date'] != lastDay:
		print( "%s\t%d" % (lastDay, len(users)+extras) )
		users = {}
		services = {}
		extras = 0

	match = extraExpression.search( parsed['userAgent'] )
	if match:
		# Match services like feedburner and bloglines
		service, numSubscribers = match.groups()
		if service not in services:
			# Only the first hit per service and day contributes its count.
			extras += int( numSubscribers )
			services[service] = parsed['ip']
		elif services[service] != parsed['ip']:
			print( "Multiple hits from %s: original ip: %s new ip: %s" % ( service, services[service], parsed['ip'] ) )
	else:
		# A direct reader is identified by its address plus user agent.
		users[( parsed['ip'], parsed['userAgent'] )] = 1

	lastDay = parsed['date']

# Report the final day. With empty input there is no day to report, so print
# nothing rather than a "None\t0" row.
if lastDay is not None:
	print( "%s\t%d" % (lastDay, len(users)+extras) )

