Index: src/django_gheat/gheat/management/commands/import_kismet.py
===================================================================
--- src/django_gheat/gheat/management/commands/import_kismet.py	(revision 9559)
+++ src/django_gheat/gheat/management/commands/import_kismet.py	(revision 9560)
@@ -1,7 +1,9 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
 #
 # Script for importing .gpsxml and .netxml files (Kismet output)
 #
-
+# Rick van der Zwet <info@rickvanderzwet.nl>
+#
 from django.core.management.base import BaseCommand,CommandError
 from django.db.utils import IntegrityError
@@ -15,28 +17,39 @@
 import logging
 
-def import_file(gpsxml_file, netxml_file, meetrondje, kaart, gebruiker, email):
-  # TODO: Source source is variable entitity, based on mesurement
-  kaart = 'deadcode'
-  gebruiker, created = Gebruiker.objects.get_or_create(naam=gebruiker , email=email)
-  apparatuur, created = Apparatuur.objects.get_or_create(kaart=kaart)
-  # TODO: Date is set to import date, but should pick the date from the netxml file
-  mr = MeetRondje.objects.create(datum=None,
-    naam=meetrondje , gebruiker=gebruiker , apparatuur=apparatuur)
-  if not created:
-    logging.error("Meetrondje '%s' already imported" % mr)
-    sys.exit(1)
+from collections import defaultdict
 
-  open_file = lambda file: gzip.open(file,'rb') if file.endswith('.gz') else open(file,'rb')
+from import_droidstumbler import bulk_sql
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+def import_kismet(gpsxml_file, netxml_file, meetrondje):
+
+  # Open files for reading
+  def open_file(file):
+   if file.endswith('.gz'):
+     return gzip.open(file,'rb')
+   else:
+    return open(file,'rb')
   gpsxml_doc = etree.parse(open_file(gpsxml_file))
   netxml_doc = etree.parse(open_file(netxml_file))
 
+  #Various statistics
+  counters = {'meting_added' : 0, 'meting_total' : 0, 'meting_failed' : 0, 
+              'ap_added' : 0, 'ap_total' : 0, 'ap_failed' : 0}
+
+  bssid_failed = defaultdict(int)
+
+  # Prepare new access points and measurements
+  wnetworks = netxml_doc.findall('wireless-network')
   points = gpsxml_doc.findall('gps-point')
-  wnetworks = netxml_doc.findall('wireless-network')
+
+  # Temporary holders
+  meting_pool = defaultdict(list)
+  ap_pool = {}
 
   # Create all accesspoints and for caching validation purposes store them
   # locally as well
-  ap_cache = {}
   ap_ignore = []
-  print "#INFO: Going to import %s accesspoints" % len(wnetworks)
   for wnetwork in wnetworks:
     bssid = wnetwork.find('BSSID').text
@@ -46,82 +59,125 @@
       continue
 
-    enc = (wnetwork.find('SSID/encryption') != None)
+    encryption = (wnetwork.find('SSID/encryption') != None)
     ssid_node = wnetwork.find('SSID/essid[@cloaked="false"]')
     ssid = ssid_node.text if ssid_node != None else 'hidden'
 
-    ap, created = Accespoint.objects.get_or_create(mac=bssid, ssid=ssid, encryptie=enc)
-    ap_cache[bssid] = ap
+    counters['meting_total'] += 1
+    ap_pool[bssid] = (ssid, encryption)
 
-  count = 0
-  #XXX: This is not effient at all, try to wrap it into a a bulk insert would
-  # be much more effient as for example: http://djangosnippets.org/snippets/2362/
-  print "#INFO: Going to import %s points" % len(points)
+
   for point in points:
-    #XXX: This needs to be either the 'bssid' or the 'source', accesspoint from or too data.
+    #XXX: This needs to be either the 'bssid' or the 'source', 
+    #XXX: accesspoint from or too data.
     bssid = point.attrib['bssid']
-    # XXX: Filter this in the beginning with XPath, but etree does not support that (yet).
+    # XXX: Filter this in the beginning with XPath, but etree does not support
+    # that (yet).
     if bssid in ['GP:SD:TR:AC:KL:OG','00:00:00:00:00:00']:
       continue
     elif bssid in ap_ignore:
       continue
-    elif not ap_cache.has_key(bssid):
-      try:
-        ap = Accespoint.objects.get(mac=bssid)
-        ap_cache[bssid] = ap
-      except Accespoint.DoesNotExist:
-        print "#ERROR: Cannot found SSID for BSSID '%s'" % bssid
-        continue
-
     # XXX: Signal need properly be a relation of signal_dbm and noice_dbm
     try:
-      signaal = 100 + int(point.attrib['signal_dbm'])
+      level = point.attrib['signal_dbm']
     except KeyError:
-      print "#ERROR: Point '%s' does not have signal strengh" % point
+      logger.debug("Point '%s' does not have signal strengh" % point)
+      counters['meting_failed'] += 1
+      continue
+    # We store all values found, avg or max will be done later on
+    key = (bssid, point.attrib['lat'], point.attrib['lon'])
+    signaal=100 + int(level)
+    meting_pool[key].append(signaal)
 
-    # TODO: This also saves semi-duplicates; multiple entries with the same values, except
-    # the signal strength is different. Should get an AVG or something.
-    try:
-      meting= Meting.objects.create(meetrondje=mr, accespoint=ap_cache[bssid],
-        latitude=point.attrib['lat'], longitude=point.attrib['lon'],
-        signaal=signaal)
-    except IntegrityError, e:
+  # Count measurements here (assigned: the network loop above bumps it per AP)
+  counters['meting_total'] = sum(len(s) for s in meting_pool.values()) + counters['meting_failed']
+  counters['ap_total'] = len(ap_pool)
+  # Determine which entries we need to add
+  bssid_list_present = Accespoint.objects.filter(mac__in=ap_pool.keys()).values_list('mac', flat=True)
+  bssid_list_insert = set(ap_pool.keys()) - set(bssid_list_present)
+  # Create a bulk import list and import
+  if bssid_list_insert:
+    sql_values = []
+    for bssid in bssid_list_insert:
+      ssid, encryption = ap_pool[bssid]
+      # Special trick: double '%' in the SSID to avoid escaping at a later stage
+      item = str((bssid,ssid.replace('%','%%'),encryption))
+      sql_values.append(item)
+    counters['ap_added'] = bulk_sql('gheat_accespoint (`mac`, `ssid`, `encryptie`)',sql_values)
+
+  # Build mapping for meting import; meting_pool is keyed on (bssid, lat, lon),
+  # so collect the distinct BSSIDs first instead of querying `mac` with tuples
+  bssids = sorted(set(key[0] for key in meting_pool))
+  mac2id = dict((mac, int(pk)) for mac, pk in Accespoint.objects.filter(mac__in=bssids).values_list('mac','id'))
+
+  sql_values = []
+  for (bssid,lat,lon),signals in meting_pool.iteritems():
+    if not mac2id.has_key(bssid):
+      counters['meting_failed'] += len(signals)
+      bssid_failed[bssid] += len(signals)
       continue
-    # Give some feedback to the user
-    count += 1
-    if (count % 1000) == 0:
-      sys.stdout.write(str(count))
-    elif (count % 100) == 0:
-      sys.stdout.write(".")
-    sys.stdout.flush()
+    # Keep the strongest reading of this key; `signals` is its list of values
+    sql_values.append(str((int(meetrondje.id),mac2id[bssid],float(lat),float(lon),max(signals))))
 
-  sys.stdout.write("%s\n" % count)
-  print "#INFO: All done, goodbye"
+  for bssid,count in sorted(bssid_failed.items(),
+      key=lambda item: item[1], reverse=True):
+    logger.debug("Missing BSSID %s found %3s times", bssid, count)
+
+
+  if sql_values:
+    counters['meting_added'] = bulk_sql('gheat_meting (`meetrondje_id`, `accespoint_id`, `lat`, `lng`, `signaal`)',sql_values)
+  return counters
 
 
 class Command(BaseCommand):
-  args = '<gpsxml>[.gz] [<netxml>[.gz]]'
+  args = '<gpsxml>[.gz] [gpsxml2[.gz]  gpsxml3[.gz] ...]'
   option_list = BaseCommand.option_list + (
-    make_option('-m', '--meetrondje', dest='meetrondje', default='rondje',help='Naam van het meetrondje'),
     make_option('-k', '--kaart', dest='kaart', default='onbekend', help="Kaart gebruikt"),
+    make_option('-m', '--meetrondje', dest='meetrondje', default=None, help='Naam van het meetrondje (standaard afgeleid van de bestandsnaam)'),
     make_option('-g', '--gebruiker', dest='gebruiker', default='username',help='Naam van de persoon die de meting uitgevoerd heeft'),
     make_option('-e', '--email', dest='email', default='foo@bar.org',help='Email van de persoon die de meting uitgevoerd heeft'),
-    )
+    # Adjacent literals: a backslash continuation would embed the indentation
+    make_option('-d', '--datum', dest='datum', default=None, help="Provide date in following "
+      "format: '%Y%m%d-%H-%M-%S-1', by default it will be generated from the filename"),
+  )
 
   def handle(self, *args, **options):
-    try:
-      if len(args) == 2:
-        (gpsxml_file, netxml_file) = args
-      elif len(args) == 1:
-        (gpsxml_file,) = args
-        netxml_file = gpsxml_file.replace('.gpsxml','.netxml')
-      else:
-        raise ValueError
-    except ValueError:
+    if len(args) == 0:
       self.print_help(sys.argv[0],sys.argv[1])
       raise CommandError("Not all arguments are provided")
-    if not os.path.isfile(gpsxml_file):
-      raise CommandError("gpsxml file '%s' does not exists" % gpsxml_file)
-    if not os.path.isfile(netxml_file):
-      raise CommandError("netxml file '%s' does not exists" % netxml_file)
 
-    import_file(gpsxml_file, netxml_file ,options['meetrondje'], options['kaart'],options['gebruiker'],options['email'])
+    for gpsxml_file in args:
+      if not os.path.isfile(gpsxml_file):
+        raise CommandError("gpsxml file '%s' does not exists" % gpsxml_file)
+
+      netxml_file = gpsxml_file.replace('.gpsxml','.netxml')
+      if not os.path.isfile(netxml_file):
+        raise CommandError("correlated netxml file '%s' does not exists" % netxml_file)
+
+      logger.info("Processing '%s'" % gpsxml_file)
+      # Drop the extensions as real suffixes; str.rstrip() strips character
+      # sets and would also eat trailing letters of the name itself.
+      stem = os.path.basename(gpsxml_file)
+      for suffix in ('.gz', '.gpsxml'):
+        if stem.endswith(suffix):
+          stem = stem[:-len(suffix)]
+      # Date from the filename if needed, e.g. Kismet-20110805-15-37-30-1
+      datum = options['datum']
+      if datum is None:
+        datum = stem[len('Kismet-'):] if stem.startswith('Kismet-') else stem
+      try:
+        datum = datetime.datetime.strptime(datum,'%Y%m%d-%H-%M-%S-1')
+      except ValueError:
+        raise CommandError("Invalid date '%s'" % datum)
+      meetrondje = stem if options['meetrondje'] is None else options['meetrondje']
+
+      # Create meetrondje object
+      g, created = Gebruiker.objects.get_or_create(naam=options['gebruiker'] , email=options['email'])
+      a, created = Apparatuur.objects.get_or_create(kaart=options['kaart'])
+      mr, created = MeetRondje.objects.get_or_create(datum=datum , naam=meetrondje , gebruiker=g , apparatuur=a)
+      logger.info('Meetrondje: %s @ %s' % (meetrondje, datum))
+      if not created:
+        logger.error("Meetrondje '%s' already imported" % mr)
+        sys.exit(1)
+      counters = import_kismet(gpsxml_file, netxml_file, mr)
+      logger.info("summary accespoints: total:%(ap_total)-6s added:%(ap_added)-6s failed:%(ap_failed)-6s" % counters)
+      logger.info("summary metingen   : total:%(meting_total)-6s added:%(meting_added)-6s failed:%(meting_failed)-6s" % counters)
