# This page is part of the MultiProbe project.
# Return to the previous page.
#! /usr/bin/env python

"""
NAME
    ParseMachineDirs -- process the passive-start measurement files
                        from the Delft BitTorrent Measurements 2 dataset

DESCRIPTION
    Process the passive-start measurement files from the 
    Delft BitTorrent Measurements 2 dataset [http://multiprobe.ewi.tudelft.nl].
    Note that correlating this information, or doing anything useful with
    it, is the task of the user. Here, we only provide a bare bones script
    for proving the concept.

SYNOPSIS
    %(progname)s [args]
             
ARGUMENTS
    
    -h, --help
        display this help screen
    
    -d <dir names>, --dirs=<dir names>
        process files in all given directories, given as a
        comma-separated list [default ./]

    Flags:
    -v, --verbose
        display verbose info
        
    -r, --recursive
        parse the input directories recursively;
        create output directories to match the parsed structure
        

CONTACT INFO
    
    To contact us, please send an email to 
    
        Alexandru IOSUP <A.Iosup at ewi.tudelft.nl>
        
    Please report bugs to the above email address, but prefix your subject 
    line with "[MultiProbe][BUG]" (quotes should be omitted). 

    
COPYRIGHT & LICENSE

    copyright © 2005 Alexandru Iosup. All rights reserved.
    
    Permission to use, modify, or distribute is granted for
    all academic use. For commercial use, a written agreement 
    from the copyright holder is required.
"""

# Module metadata.  The $Revision$ / $Date$ values look like
# revision-control keyword fields (presumably expanded by CVS/RCS).
__author__ = "Alexandru Iosup"
__email__ = "A.Iosup at ewi.tudelft.nl"
__file__ = "ParseMachineDirs.py"
__version__ = "$Revision: 0.1$"
__date__ = '$Date: 2005/05/11 16:00:10 $'
__copyright__ = 'Copyright (c) 2005 Alexandru IOSUP'
__license__ = 'Python (can use, modify, or distribute freely)'

# Machine directory structure
# machinename/ (e.g. blast.uwaterloo.ca)
#   datetime/  (YYYY-DD-MM; e.g., 2005-09-05_11-37)
#     base_data/
#        trackingstats
#        trackinginfo
#        edges_cache.dat
#        destinations.dat
#        sources.dat
#     torrent_data/
#        TorrentXXXXXXXX-LP.err
#        TorrentXXXXXXXX-LP.res
#        buffer.err
#        buffer.res
#        tracker.err
#        tracker.res
#     batchXXXXXXXX/
#        ipaddresses
#        new_edges_cache.dat
#        paths/
#           sources-IP.dat (e.g., sources-234.123.4.5.dat)
#           path_IP1-IP2   (e.g., path_123.45.67.8-234.123.4.5)
#                                      


import sys
import os
import getopt
import string
import time
import glob
import re
import traceback
import StringIO


# Module-wide verbosity level; the processing functions print progress
# messages when it is greater than zero.
__verbose = 0

# Dotted-quad matcher: group 'first3' holds the first three octets
# (including the trailing dot), group 'last' holds the fourth octet.
IPMatcher = re.compile(
    r'(?P<first3>\d{1,3}\.\d{1,3}\.\d{1,3}\.)(?P<last>\d{1,3})'
)

# Same as IPMatcher, plus a group 'plus' capturing any run of '+'
# characters that directly follows the address (possibly empty).
IPPlusMatcher = re.compile(
    r'(?P<first3>\d{1,3}\.\d{1,3}\.\d{1,3}\.)(?P<last>\d{1,3})(?P<plus>[\+]*)'
)

def parseP01( line ):
    """Parse one line of a 'trackinginfo' file and echo its fields.

    The line must be tab-separated: a timestamp, a record type and the
    type-specific fields.  Three record types are recognised:

        IP      BatchNo, FileName, IP, Port, Unique
        E-IP    BatchNo, IP
        E-DATA  BatchNo, IP1, IP2

    Recognised records are printed to stdout as space-separated fields;
    records of any other type are skipped silently.  A line that cannot
    be split as expected is reported on stdout, followed by the
    traceback, instead of raising.
    """
    try:
        line = line.strip()
        TS, text, extra = line.split('\t', 2)
        if text == 'IP':
            BatchNo, FileName, IP, Port, Unique = extra.split('\t')
            print(' '.join([TS, text, BatchNo, FileName, IP, Port, Unique]))
        elif text == 'E-IP':
            BatchNo, IP = extra.split('\t')
            print(' '.join([TS, text, BatchNo, IP]))
        elif text == 'E-DATA':
            BatchNo, IP1, IP2 = extra.split('\t')
            print(' '.join([TS, text, BatchNo, IP1, IP2]))
    except Exception:
        # Best effort: report the offending line and keep going.
        # format_exc() returns the traceback text; print_exc() returns
        # None, which used to be printed literally.
        print("Exception!!!! line= '%s'" % (line,))
        print(">>> " + traceback.format_exc().rstrip())

def parseP02( line ):
    """Parse one line of a 'trackingstats' file and echo its fields.

    The line must be tab-separated: a timestamp, a record type and the
    type-specific fields.  Two record types are recognised:

        BATCH   BatchNo, NBatchIPs, NUniqueBatchIPs
        EDGES   BatchNo, NoDestinations, NPackets

    Recognised records are printed to stdout as space-separated fields;
    records of any other type are skipped silently.  A line that cannot
    be split as expected is reported on stdout, followed by the
    traceback, instead of raising.
    """
    try:
        line = line.strip()
        TS, text, extra = line.split('\t', 2)
        if text == 'BATCH':
            BatchNo, NBatchIPs, NUniqueBatchIPs = extra.split('\t')
            print(' '.join([TS, text, BatchNo, NBatchIPs, NUniqueBatchIPs]))
        elif text == 'EDGES':
            BatchNo, NoDestinations, NPackets = extra.split('\t')
            print(' '.join([TS, text, BatchNo, NoDestinations, NPackets]))
    except Exception:
        # Best effort: report the offending line and keep going.
        # format_exc() returns the traceback text; print_exc() returns
        # None, which used to be printed literally.
        print("Exception!!!! line= '%s'" % (line,))
        print(">>> " + traceback.format_exc().rstrip())
    
def parseP03( line ):
    """Echo one line of a 'destinations.dat' file with whitespace stripped.

    Each line is presumably a single IP address; no validation is done.
    If the line cannot be stripped (e.g. it is not a string) the problem
    is reported on stdout, followed by the traceback, instead of raising.
    """
    try:
        print(line.strip())
    except Exception:
        # %-formatting copes with non-string input, which is the only
        # way to get here; string concatenation would raise again.
        print("Exception!!!! line= '%s'" % (line,))
        print(">>> " + traceback.format_exc().rstrip())
    
def parseP04( line ):
    """Parse one line of an 'edges_cache.dat' file and echo its two fields.

    The stripped line must consist of exactly two fields separated by a
    single space; they are printed back, space-separated, to stdout.
    Any other shape is reported on stdout, followed by the traceback,
    instead of raising.
    """
    try:
        line = line.strip()
        IP1Plus, IP2Plus = line.split(' ')
        print(' '.join([IP1Plus, IP2Plus]))
    except Exception:
        # Best effort: report the offending line and keep going.
        # format_exc() returns the traceback text; print_exc() returns
        # None, which used to be printed literally.
        print("Exception!!!! line= '%s'" % (line,))
        print(">>> " + traceback.format_exc().rstrip())
    
def parseP05( line ):
    """Echo one line of a 'sources.dat' file with whitespace stripped.

    Each line is presumably a single IP address; no validation is done.
    If the line cannot be stripped (e.g. it is not a string) the problem
    is reported on stdout, followed by the traceback, instead of raising.
    """
    try:
        print(line.strip())
    except Exception:
        # %-formatting copes with non-string input, which is the only
        # way to get here; string concatenation would raise again.
        print("Exception!!!! line= '%s'" % (line,))
        print(">>> " + traceback.format_exc().rstrip())

def parseP06( line ):
    """Echo one raw line of a 'buffer.err' file to stdout, unmodified.

    The line is printed exactly as received, so a line that still
    carries its newline is followed by an empty line in the output.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(line)
    
def parseP07( line ):
    """Echo one raw line of a 'buffer.res' file to stdout, unmodified.

    The line is printed exactly as received, so a line that still
    carries its newline is followed by an empty line in the output.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(line)
    
def parseP08( line ):
    """Echo one raw line of a 'Torrent*.err' file to stdout, unmodified.

    The line is printed exactly as received, so a line that still
    carries its newline is followed by an empty line in the output.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(line)
    
def parseP09( line ):
    """Parse one line of a 'Torrent*.res' file and echo the peer data.

    Two kinds of lines are handled after stripping whitespace:

    * Lines starting with '#': only those containing 'PeerList' are
      used.  They are split on single spaces and the third and fourth
      fields (IP and port) are printed.  Other '#' lines are ignored.
    * All other lines: the first two space-separated fields (timestamp
      and IP) are printed; at least three fields must be present.

    Lines that do not fit are reported on stdout, followed by the
    traceback, instead of raising.  This includes blank lines.
    """
    try:
        line = line.strip()
        if line.startswith('#'):
            if 'PeerList' in line:
                # The trailing space guarantees a fifth field even when
                # nothing follows the port.
                padded = line + ' '
                try:
                    marker, text, IP, Port, extra = padded.split(' ', 4)
                    print(IP + ' ' + Port)
                except Exception:
                    print("ERROR! line= '%s'" % (padded,))
                    print(">>> " + traceback.format_exc().rstrip())
        else:
            TS, IP, extra = line.split(' ', 2)
            print(TS + ' ' + IP)
    except Exception:
        # Best effort: report the offending line and keep going.
        # format_exc() returns the traceback text; print_exc() returns
        # None, which used to be printed literally.
        print("Exception!!!! line= '%s'" % (line,))
        print(">>> " + traceback.format_exc().rstrip())
        
def parseP10( line ):
    """Echo one raw line of a 'tracker.err' file to stdout, unmodified.

    The line is printed exactly as received, so a line that still
    carries its newline is followed by an empty line in the output.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(line)
    
def parseP11( line ):
    """Echo one raw line of a 'tracker.res' file to stdout, unmodified.

    The line is printed exactly as received, so a line that still
    carries its newline is followed by an empty line in the output.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(line)
    
def parseP12( line ):
    """Echo one line of an 'ipaddresses' file with whitespace stripped.

    Each line is presumably a single IP address; no validation is done.
    If the line cannot be stripped (e.g. it is not a string) the problem
    is reported on stdout, followed by the traceback, instead of raising.
    """
    try:
        print(line.strip())
    except Exception:
        # %-formatting copes with non-string input, which is the only
        # way to get here; string concatenation would raise again.
        print("Exception!!!! line= '%s'" % (line,))
        print(">>> " + traceback.format_exc().rstrip())
        
def parseP13( line ):
    """Echo one raw line of a 'measure_edges.err' file to stdout, unmodified.

    The line is printed exactly as received, so a line that still
    carries its newline is followed by an empty line in the output.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(line)
    
def parseP14( line ):
    """Echo one raw line of a 'measure_edges.res' file to stdout, unmodified.

    The line is printed exactly as received, so a line that still
    carries its newline is followed by an empty line in the output.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(line)

def parseP15( line ):
    """Parse one line of a 'new_edges_cache.dat' file and echo its two fields.

    The stripped line must consist of exactly two fields separated by a
    single space; they are printed back, space-separated, to stdout.
    Any other shape is reported on stdout, followed by the traceback,
    instead of raising.
    """
    try:
        line = line.strip()
        IP1Plus, IP2Plus = line.split(' ')
        print(' '.join([IP1Plus, IP2Plus]))
    except Exception:
        # Best effort: report the offending line and keep going.
        # format_exc() returns the traceback text; print_exc() returns
        # None, which used to be printed literally.
        print("Exception!!!! line= '%s'" % (line,))
        print(">>> " + traceback.format_exc().rstrip())
    
def parseP16( line ):
    """Parse one line of a 'paths/path_*' file and echo it.

    After stripping whitespace:

    * A line starting with a digit that contains 'unresponsive' is
      printed as is (an unresponsive hop).
    * Any other line starting with a digit is split on single spaces
      into hop, IP and the remainder, which are printed space-separated;
      at least three fields must be present.
    * Every other line, including a blank one, is printed with the
      prefix 'ERROR LINE:'.

    A digit-led line with too few fields is reported on stdout,
    followed by the traceback, instead of raising.
    """
    try:
        line = line.strip()
        # Slicing instead of indexing: a blank line yields '' (not an
        # IndexError) and falls through to the ERROR LINE branch.
        if line[:1].isdigit():
            if 'unresponsive' in line:
                print(line) # unresponsive hop line
            else:
                hop, IP, extra = line.split(' ', 2)
                print(' '.join([hop, IP, extra]))
        else:
            print("ERROR LINE: " + line)
    except Exception:
        # Best effort: report the offending line and keep going.
        # format_exc() returns the traceback text; print_exc() returns
        # None, which used to be printed literally.
        print("Exception!!!! line= '%s'" % (line,))
        print(">>> " + traceback.format_exc().rstrip())

def parseP17( line ):
    """Echo one line of a 'paths/sources-*' file with whitespace stripped.

    Each line is presumably a single IP address; no validation is done.
    If the line cannot be stripped (e.g. it is not a string) the problem
    is reported on stdout, followed by the traceback, instead of raising.
    """
    try:
        print(line.strip())
    except Exception:
        # %-formatting copes with non-string input, which is the only
        # way to get here; string concatenation would raise again.
        print("Exception!!!! line= '%s'" % (line,))
        print(">>> " + traceback.format_exc().rstrip())

    
class PBaseFilters:
    # Describes the 'base_data' sub-directory of a measurement directory.
    #
    # Dir   -- glob pattern matching the sub-directory name
    # Files -- list with one dictionary per file type, giving its code
    #          name ('name'), the glob pattern of its file name
    #          ('filter'), and the function that parses one line of
    #          such a file ('parser')
    Dir = "base_data"
    Files = [
        dict(name='P01', filter="trackinginfo", parser=parseP01),
        dict(name='P02', filter="trackingstats", parser=parseP02),
        dict(name='P03', filter="destinations.dat", parser=parseP03),
        dict(name='P04', filter="edges_cache.dat", parser=parseP04),
        dict(name='P05', filter="sources.dat", parser=parseP05),
    ]
    
class PTorrentFilters:
    # Describes the 'torrent_data' sub-directory of a measurement directory.
    #
    # Dir   -- glob pattern matching the sub-directory name
    # Files -- list with one dictionary per file type, giving its code
    #          name ('name'), the glob pattern of its file name
    #          ('filter'), and the function that parses one line of
    #          such a file ('parser')
    Dir = "torrent_data"
    Files = [
        dict(name='P06', filter="buffer.err", parser=parseP06),
        dict(name='P07', filter="buffer.res", parser=parseP07),
        dict(name='P08', filter="Torrent*.err", parser=parseP08),
        dict(name='P09', filter="Torrent*.res", parser=parseP09),
        dict(name='P10', filter="tracker.err", parser=parseP10),
        dict(name='P11', filter="tracker.res", parser=parseP11),
    ]
    
class PBatchFilters:
    # Describes the 'batch*' sub-directories of a measurement directory.
    #
    # Dir   -- glob pattern matching the sub-directory names
    # Files -- list with one dictionary per file type, giving its code
    #          name ('name'), the glob pattern of its file name relative
    #          to the sub-directory ('filter'), and the function that
    #          parses one line of such a file ('parser')
    Dir = "batch*"
    Files = [
        dict(name='P12', filter="ipaddresses", parser=parseP12),
        dict(name='P13', filter="measure_edges.err", parser=parseP13),
        dict(name='P14', filter="measure_edges.res", parser=parseP14),
        dict(name='P15', filter="new_edges_cache.dat", parser=parseP15),
        dict(name='P16', filter=os.path.join("paths","path_*"), parser=parseP16),
        dict(name='P17', filter=os.path.join("paths","sources-*"), parser=parseP17),
    ]

    
def processFile(InFileName, parser):
    """Feed every line of a text file to a parser function.

    InFileName -- path of the file to read (opened in text mode)
    parser     -- callable invoked once per line, in file order; each
                  line is passed with its line terminator still attached

    Progress messages are printed when the module-level verbosity is
    greater than zero.  Errors from open() and exceptions raised by the
    parser propagate to the caller; the file is closed in either case.
    """
    global __verbose
    if __verbose>0: print("[start] processing %s" % (InFileName,))

    #-- process file; 'with' closes it even if the parser raises
    with open(InFileName, "rt") as InFile:
        # iterating the file object reads it lazily, line by line
        for line in InFile:
            parser(line)

    if __verbose>0: print("[done] processing %s" % (InFileName,))

    
def processTextAndBinaryFile(InFileName, parser):
    """Feed every line of a possibly binary file to a parser function.

    The file is read as raw bytes.  Every byte outside the printable
    ASCII range 32..126 is replaced by '#', except for newline, carriage
    return and tab, which are kept.  The cleaned text is then split at
    newlines and each line is passed to the parser.

    InFileName -- path of the file to read
    parser     -- callable invoked once per line, in file order; each
                  line keeps its trailing newline when it has one

    Progress messages are printed when the module-level verbosity is
    greater than zero.  Errors from open() and exceptions raised by the
    parser propagate to the caller.
    """
    global __verbose
    if __verbose>0: print("[start] processing %s" % (InFileName,))

    #-- read the raw bytes; 'with' closes the file even on error
    with open(InFileName, "rb") as InFile:
        Data = InFile.read()

    #-- build a 256-entry translation table: everything maps to '#'
    #   except printable ASCII and the three whitespace bytes kept as is
    Table = bytearray(b'#' * 256)
    for Code in range(32, 127):
        Table[Code] = Code
    for Code in (9, 10, 13): # tab, newline, carriage return
        Table[Code] = Code
    Data = Data.translate(bytes(Table))

    # On Python 3 the result is bytes and must become text; on Python 2
    # it already is a str.  Only ASCII remains, so decoding cannot fail.
    if not isinstance(Data, str):
        Data = Data.decode('ascii')

    #-- process lines; split at '\n' only, so a lone '\r' stays inside
    #   its line
    Pieces = Data.split('\n')
    Tail = Pieces.pop() # text after the last newline, possibly empty
    for Piece in Pieces:
        parser(Piece + '\n')
    if Tail:
        parser(Tail)

    if __verbose>0: print("[done] processing %s" % (InFileName,))

    
def parseSubDir(InDir, SubDirInfo):
    """Process all known files in the sub-directories of one kind.

    InDir      -- directory in which to look for the sub-directories
    SubDirInfo -- object with two attributes: 'Dir', a glob pattern for
                  the sub-directory name, and 'Files', a list of
                  dictionaries with a 'filter' (glob pattern for file
                  names inside the sub-directory) and a 'parser' (line
                  parser function)

    Every regular file matching a filter is passed to processFile()
    together with the corresponding parser.  Matching sub-directories
    that are symbolic links are skipped.  Directories and files are
    visited in sorted order so that the output is reproducible.
    """
    global __verbose

    DirFilter = os.path.join( os.path.abspath(InDir), SubDirInfo.Dir )
    # glob returns matches in arbitrary order; sort for determinism
    for DirName in sorted(glob.glob(DirFilter)):
        if not os.path.isdir(DirName) or os.path.islink(DirName):
            continue
        if __verbose>0: print("found directory %s" % (DirName,))
        for FileInfo in SubDirInfo.Files:
            FullFilter = os.path.join( os.path.abspath(DirName), FileInfo['filter'] )
            for FileName in sorted(glob.glob(FullFilter)):
                if os.path.isfile(FileName):
                    if __verbose>0: print("processing file %s" % (FileName,))
                    processFile(FileName, FileInfo['parser'])

    
def parseJustThisDir(InDir):
    """Process the known sub-directories located directly in InDir.

    The three sub-directory kinds are handled in a fixed order:
    base_data, then torrent_data, then batch*.
    """
    SubDirKinds = (
        PBaseFilters,     #-- base_data
        PTorrentFilters,  #-- torrent_data
        PBatchFilters,    #-- batch*
    )
    for SubDirKind in SubDirKinds:
        parseSubDir(InDir, SubDirKind)
    
        
def parseInDir(InDir, RecursiveFlag, CurrentLevel = 0, MaxLevel = 2):
    """Process InDir and, optionally, the directories below it.

    InDir         -- directory to process
    RecursiveFlag -- 1 to descend into sub-directories first
    CurrentLevel  -- depth of InDir relative to the starting directory
    MaxLevel      -- sub-directories are searched only while
                     CurrentLevel <= MaxLevel

    Sub-directories (symbolic links excluded) are processed before
    InDir itself.
    """
    MayDescend = (RecursiveFlag == 1) and (CurrentLevel <= MaxLevel)
    if MayDescend:
        BaseDir = os.path.abspath(InDir)
        for Entry in os.listdir(InDir):
            Candidate = os.path.join(BaseDir, Entry)
            if os.path.islink(Candidate) or not os.path.isdir(Candidate):
                continue # only real sub-directories are searched
            parseInDir(Candidate, RecursiveFlag, CurrentLevel + 1, MaxLevel)

    #-- finally, the directory itself
    parseJustThisDir(InDir)
    
def usage(progname=None):
    """Print the module docstring as the help screen.

    progname -- program name substituted for %(progname)s in the help
                text; defaults to the base name of sys.argv[0] so that
                the function can also be called without arguments
    """
    if progname is None:
        progname = os.path.basename(sys.argv[0])
    # __doc__ is None when docstrings are stripped (python -OO); fall
    # back to a one-line synopsis instead of failing.
    HelpText = __doc__ or "usage: %(progname)s [args]"
    print(HelpText % {'progname': progname})

def main(argv):
    """Parse the command line and process the requested directories.

    argv -- list of command-line arguments without the program name

    Options: -h/--help, -v/--verbose, -r/--recursive and
    -d/--dirs=<comma-separated directory list>.  Without -d the current
    directory is processed.  Exits with status 2 on an invalid option
    and with status 0 after showing the help screen.
    """
    # The verbosity flag is read by the other functions of the module,
    # so it must be the module-level variable, not a local one.
    global __verbose

    try:
        opts, args = getopt.getopt(argv, "hvrd:", ["help", "verbose", "recursive", "dirs="])
    except getopt.GetoptError:
        usage(os.path.basename(sys.argv[0]))
        sys.exit(2)

    InDirs = []

    RecursiveFlag = 0
    __verbose = 0
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage(os.path.basename(sys.argv[0]))
            sys.exit()
        elif opt in ("-v", "--verbose"):
            __verbose = 1
            print("***VERBOSE***")
        elif opt in ("-r", "--recursive"):
            RecursiveFlag = 1
        elif opt in ("-d", "--dirs"):
            # getopt reports long options by their full declared name
            # ("--dirs"), even when abbreviated on the command line.
            for Dir in arg.split(','):
                # skip empty entries (e.g. "a,,b") and duplicates
                if Dir and Dir not in InDirs:
                    InDirs.append(Dir)

    if not InDirs: InDirs = ['.']

    # NOTE(review): the timestamps below are formatted year-day-month;
    # confirm that this ordering is intended.
    for InDir in InDirs:
        if __verbose>0: print("processing of directory %s started at %s" % (InDir, time.strftime('%Y-%d-%m %H:%M:%S', time.localtime())))
        parseInDir(InDir, RecursiveFlag)
        if __verbose>0: print("processing of directory %s ended at %s" % (InDir, time.strftime('%Y-%d-%m %H:%M:%S', time.localtime())))

if __name__ == "__main__":
    # Called with arguments: hand them (minus the program name) to
    # main().  Called with none at all: show the help screen and exit
    # with status 3.
    if len(sys.argv) != 1:
        main(sys.argv[1:])
    else:
        usage(os.path.basename(sys.argv[0]))
        sys.exit(3)