Add support for bot part of internal_test.py

--continuous will now run in a loop and wait for work based on a file on GCS
--bot will run the other end of the pipeline, putting work on the GCS, waiting for results

There is some sanity checking to ensure that there are no duplicate
processes running - currently the default is to simply fail and log
the status.

Current setup has excessive logging of the magic files on purpose, to
ensure that we can quickly fix any synchronization issues.

Change-Id: I5d6792a880ab66c73a6ef7ee7396cb3873e06396
diff --git a/tools/internal_test.py b/tools/internal_test.py
index 0cb9019..16ed783 100755
--- a/tools/internal_test.py
+++ b/tools/internal_test.py
@@ -4,6 +4,30 @@
 # BSD-style license that can be found in the LICENSE file.
 
 # Run all internal tests, archive result to cloud storage.
+# In the continuous operation flow we have a tester continuously checking
+# a specific cloud storage location for a file with a git hash.
+# If the file is there, the tester will remove the file, and add another
+# file stating that this is now being run. After successfully running,
+# the tester will add yet another file, and remove the last one.
+# Complete flow with states:
+# 1:
+#   BOT:
+#     Add file READY_FOR_TESTING (contains git hash)
+#     Wait until file TESTING_COMPLETE exists (contains git hash)
+#     Timeout if no progress for RUN_TIMEOUT
+#       Cleanup READY_FOR_TESTING and TESTING
+# 2:
+#   TESTER:
+#     Replace file READY_FOR_TESTING by TESTING (contains git hash)
+#     Run tests for git hash
+#     Upload commit specific logs if failures
+#     Upload git specific overall status file (failed or succeeded)
+#     Replace file TESTING by TESTING_COMPLETE (contains git hash)
+# 3:
+#   BOT:
+#     Read overall status
+#     Delete TESTING_COMPLETE
+#     Exit based on status
 
 import optparse
 import os
@@ -12,18 +36,36 @@
 import time
 import utils
 
-# How often to pull the git repo, in seconds.
-PULL_DELAY = 25
+# How often the bot/tester should check state
+PULL_DELAY = 30
 # Command timeout, in seconds.
 RUN_TIMEOUT = 3600
+# Add some extra time for the bot, since the tester might not start immediately.
+BOT_RUN_TIMEOUT = 4000
 BUCKET = 'r8-test-results'
 TEST_RESULT_DIR = 'internal'
 
+# Magic files
+READY_FOR_TESTING = 'READY_FOR_TESTING'
+TESTING = 'TESTING'
+TESTING_COMPLETE = 'TESTING_COMPLETE'
+
+ALL_MAGIC = [READY_FOR_TESTING, TESTING, TESTING_COMPLETE]
+
+# Log file names
+STDERR = 'stderr'
+STDOUT = 'stdout'
+EXITCODE = 'exitcode'
+TIMED_OUT = 'timed_out'
+
 def ParseOptions():
   result = optparse.OptionParser()
   result.add_option('--continuous',
       help='Continuously run internal tests and post results to GCS.',
       default=False, action='store_true')
+  result.add_option('--bot',
+      help='Run in bot mode, i.e., scheduling runs.',
+      default=False, action='store_true')
   result.add_option('--archive',
        help='Post result to GCS, implied by --continuous',
        default=False, action='store_true')
@@ -39,22 +81,39 @@
     print('Restarting tools/internal_test.py, content changed')
     os.execv(sys.argv[0], sys.argv)
 
-def git_pull():
+def ensure_git_clean():
   # Ensure clean git repo.
   diff = subprocess.check_output(['git', 'diff'])
   if len(diff) > 0:
     print('Local modifications to the git repo, exiting')
     sys.exit(1)
+
+def git_pull():
+  ensure_git_clean()
   subprocess.check_call(['git', 'pull'])
   return utils.get_HEAD_sha1()
 
+def git_checkout(git_hash):
+  ensure_git_clean()
+  # Ensure that we are up to date to get the commit.
+  git_pull()
+  subprocess.check_call(['git', 'checkout', git_hash])
+  return utils.get_HEAD_sha1()
+
+def get_test_result_dir():
+  return os.path.join(BUCKET, TEST_RESULT_DIR)
+
 def get_sha_destination(sha):
-  return os.path.join(BUCKET, TEST_RESULT_DIR, sha)
+  return os.path.join(get_test_result_dir(), sha)
 
 def archive_status(failed):
   gs_destination = 'gs://%s' % get_sha_destination(utils.get_HEAD_sha1())
   archive_value('status', gs_destination, failed)
 
+def get_status(sha):
+  gs_destination = 'gs://%s/status' % get_sha_destination(sha)
+  return utils.cat_file_on_cloud_storage(gs_destination)
+
 def archive_file(name, gs_dir, src_file):
   gs_file = '%s/%s' % (gs_dir, name)
   utils.upload_file_to_cloud_storage(src_file, gs_file, public_read=False)
@@ -68,30 +127,97 @@
 
 def archive_log(stdout, stderr, exitcode, timed_out, cmd):
   sha = utils.get_HEAD_sha1()
-  cmd_dir = cmd.replace(' ', '_')
+  cmd_dir = cmd.replace(' ', '_').replace('/', '_')
   destination = os.path.join(get_sha_destination(sha), cmd_dir)
   gs_destination = 'gs://%s' % destination
   url = 'https://storage.cloud.google.com/%s' % destination
   print('Archiving logs to: %s' % gs_destination)
-  archive_value('exitcode', gs_destination, exitcode)
-  archive_value('timed_out', gs_destination, timed_out)
-  archive_file('stdout', gs_destination, stdout)
-  archive_file('stderr', gs_destination, stderr)
+  archive_value(EXITCODE, gs_destination, exitcode)
+  archive_value(TIMED_OUT, gs_destination, timed_out)
+  archive_file(STDOUT, gs_destination, stdout)
+  archive_file(STDERR, gs_destination, stderr)
   print('Logs available at: %s' % url)
 
+def get_magic_file_base_path():
+  return 'gs://%s/magic' % get_test_result_dir()
+
+def get_magic_file_gs_path(name):
+  return '%s/%s' % (get_magic_file_base_path(), name)
+
+def get_magic_file_exists(name):
+  return utils.file_exists_on_cloud_storage(get_magic_file_gs_path(name))
+
+def delete_magic_file(name):
+  utils.delete_file_from_cloud_storage(get_magic_file_gs_path(name))
+
+def put_magic_file(name, sha):
+  archive_value(name, get_magic_file_base_path(), sha)
+
+def get_magic_file_content(name, ignore_errors=False):
+  return utils.cat_file_on_cloud_storage(get_magic_file_gs_path(name),
+                                         ignore_errors=ignore_errors)
+
+def print_magic_file_state():
+  print('Magic file status:')
+  for magic in ALL_MAGIC:
+    if get_magic_file_exists(magic):
+      content = get_magic_file_content(magic, ignore_errors=True)
+      print('%s content: %s' % (magic, content))
+
+def run_bot():
+  print_magic_file_state()
+  # Ensure that there is nothing currently scheduled (broken/stopped run)
+  for magic in ALL_MAGIC:
+    if get_magic_file_exists(magic):
+      print('ERROR: Synchronizing file %s exists, cleaning up' % magic)
+      delete_magic_file(magic)
+  print_magic_file_state()
+  assert not get_magic_file_exists(READY_FOR_TESTING)
+  git_hash = utils.get_HEAD_sha1()
+  put_magic_file(READY_FOR_TESTING, git_hash)
+  begin = time.time()
+  while True:
+    if time.time() - begin > BOT_RUN_TIMEOUT:
+      print('Timeout exceeded')
+      raise Exception('Bot timeout')
+    if get_magic_file_exists(TESTING_COMPLETE):
+      if get_magic_file_content(TESTING_COMPLETE) == git_hash:
+        break
+      else:
+        raise Exception('Non matching git hashes %s and %s' % (
+            get_magic_file_content(TESTING_COMPLETE), git_hash))
+    print('Still waiting for test result')
+    print_magic_file_state()
+    time.sleep(PULL_DELAY)
+  total_time = time.time()-begin
+  print('Done running test for %s in %ss' % (git_hash, total_time))
+  test_status = get_status(git_hash)
+  delete_magic_file(TESTING_COMPLETE)
+  print('Test status is: %s' % test_status)
+  if test_status != '0':
+    return 1
+
 def run_continuously():
   # If this script changes, we will restart ourselves
   own_content = get_own_file_content()
-  git_hash = utils.get_HEAD_sha1()
   while True:
     restart_if_new_version(own_content)
-    print('Running with hash: %s' % git_hash)
-    exitcode = run_once(archive=True)
-    git_pull()
-    while git_pull() == git_hash:
-      print('Still on same git hash: %s' % git_hash)
-      time.sleep(PULL_DELAY)
-    git_hash = utils.get_HEAD_sha1()
+    print_magic_file_state()
+    if get_magic_file_exists(READY_FOR_TESTING):
+      git_hash = get_magic_file_content(READY_FOR_TESTING)
+      checked_out = git_checkout(git_hash)
+      # Sanity check, if this does not succeed stop.
+      if checked_out != git_hash:
+        print('Inconsistent state: %s %s' % (git_hash, checked_out))
+        sys.exit(1)
+      put_magic_file(TESTING, git_hash)
+      delete_magic_file(READY_FOR_TESTING)
+      print('Running with hash: %s' % git_hash)
+      exitcode = run_once(archive=True)
+      print('Running finished with exit code %s' % exitcode)
+      put_magic_file(TESTING_COMPLETE, git_hash)
+      delete_magic_file(TESTING)
+    time.sleep(PULL_DELAY)
 
 def handle_output(archive, stderr, stdout, exitcode, timed_out, cmd):
   if archive:
@@ -147,17 +273,21 @@
   if execute(cmd, archive):
     failed = True
   # Ensure that all internal apps compile.
-  cmd = ['tools/run_on_app.py', '--run-all', '--out=out']
+  cmd = ['tools/run_on_app.py', '--ignore-java-version','--run-all',
+         '--out=out']
   if execute(cmd, archive):
     failed = True
   archive_status(1 if failed else 0)
+  return failed
 
 def Main():
   (options, args) = ParseOptions()
   if options.continuous:
     run_continuously()
+  elif options.bot:
+    return run_bot()
   else:
-    run_once(options.archive)
+    return run_once(options.archive)
 
 if __name__ == '__main__':
   sys.exit(Main())
diff --git a/tools/run_on_app.py b/tools/run_on_app.py
index b0ca816..c2be08e 100755
--- a/tools/run_on_app.py
+++ b/tools/run_on_app.py
@@ -49,6 +49,10 @@
                     help='Running on golem, do not build or download',
                     default=False,
                     action='store_true')
+  result.add_option('--ignore-java-version',
+                    help='Do not check java version',
+                    default=False,
+                    action='store_true')
   result.add_option('--no-libraries',
                     help='Do not pass in libraries, even if they exist in conf',
                     default=False,
@@ -149,8 +153,10 @@
       exit(exit_code)
 
 def main(argv):
-  utils.check_java_version()
   (options, args) = ParseOptions(argv)
+  if not options.ignore_java_version:
+    utils.check_java_version()
+
   if options.run_all:
     return run_all(options, args)
   return run_with_options(options, args)
diff --git a/tools/utils.py b/tools/utils.py
index d96151d..20ba093 100644
--- a/tools/utils.py
+++ b/tools/utils.py
@@ -106,6 +106,27 @@
   PrintCmd(cmd)
   subprocess.check_call(cmd)
 
+def delete_file_from_cloud_storage(destination):
+  cmd = ['gsutil.py', 'rm', destination]
+  PrintCmd(cmd)
+  subprocess.check_call(cmd)
+
+def cat_file_on_cloud_storage(destination, ignore_errors=False):
+  cmd = ['gsutil.py', 'cat', destination]
+  PrintCmd(cmd)
+  try:
+    return subprocess.check_output(cmd)
+  except subprocess.CalledProcessError as e:
+    if ignore_errors:
+      return ''
+    else:
+      raise e
+
+def file_exists_on_cloud_storage(destination):
+  cmd = ['gsutil.py', 'ls', destination]
+  PrintCmd(cmd)
+  return subprocess.call(cmd) == 0
+
 def download_file_from_cloud_storage(source, destination):
   cmd = ['gsutil.py', 'cp', source, destination]
   PrintCmd(cmd)