devserver: Fix handling of stale / max update responses This fixes two shortcomings related to handling of multiple concurrent update requests: 1) Devserver would update the number of requests handled as soon as a requests was received, instead of prior to returning the response. Since obtaining the response could take a while (e.g. request to a remote devserver over a flaky network connection), the client might time out and send another request, which would be unjustly rejected. Further, the checking and updating of the counter was not atomic, so race conditions might have occurred. These are fixed by moving the counter update to the latest point possible, and doing it atomically. 2) When a response takes a while to handle, the client might time out and send a new request. In the event this happens, we want the server to realize that it should not return a response for the first request nor count it against the maximum allowed. This problem is not easy to fix completely, as there is no deterministic way to know whether or not the client has timed out (keep in mind that the max-allowed mechanism is a simplistic workaround for a more elaborate rule-based response mechanism, for testing purposes). We therefore *minimize* the risk by checking, as close as possible to where we "commit" to sending a response, whether a newer request was received. There is still a small chance that a unique and rare timing of events will cause this problem to surface. BUG=chromium:321272 TEST=Unit tests. Change-Id: I2e9e31b3e9c8c28d8d3e39c0c7d342c102243433 Reviewed-on: https://chromium-review.googlesource.com/179083 Tested-by: Gilad Arnold <garnold@chromium.org> Reviewed-by: Chris Sosa <sosa@chromium.org> Commit-Queue: Gilad Arnold <garnold@chromium.org>

commit: d0c7175608ba293d1639e9baca9f7d7f84c4e137 [log] [tgz]
author: Gilad Arnold <garnold@chromium.org> Fri Dec 06 11:48:45 2013 -0800
committer: chrome-internal-fetch <chrome-internal-fetch@google.com> Mon Dec 09 19:25:02 2013 +0000
tree: fe0152669f8dc15ee0b19ec2ee69fc2ed492932e
parent: 9fa4bde4d41a1f9cea0b1a7ce1472299f221ca96 [diff] [blame]
diff --git a/autoupdate.py b/autoupdate.py
index fe930cb..f91811f 100644
--- a/autoupdate.py
+++ b/autoupdate.py

@@ -5,9 +5,11 @@
 import base64
 import json
 import os
+import random
 import struct
 import subprocess
 import sys
+import threading
 import time
 import urllib2
 import urlparse
@@ -199,6 +201,9 @@
     # host, as well as a dictionary of current attributes derived from events.
     self.host_infos = HostInfoTable()
 
+    self.curr_request_id = -1
+    self._update_response_lock = threading.Lock()
+
   @classmethod
   def _ReadMetadataFromStream(cls, stream):
     """Returns metadata obj from input json stream that implements .read()."""
@@ -803,14 +808,18 @@
              forced_update_label)
       label = forced_update_label
 
+    # Make sure that we did not already exceed the max number of allowed
+    # responses; note that this is merely an optimization, as the definitive
+    # check and updating of the counter is done later, right before returning a
+    # response.
     if self.max_updates == 0:
-      # In case max_updates is used, return no response if max reached.
       _Log('Request received but max number of updates handled')
       return autoupdate_lib.GetNoUpdateResponse(protocol)
 
-    _Log('Update Check Received. Client is using protocol version: %s',
-         protocol)
-    self.max_updates -= 1
+    request_id = random.randint(0, sys.maxint)
+    self.curr_request_id = request_id
+    _Log('Update Check Received (id=%d). Client is using protocol version: %s',
+         request_id, protocol)
 
     # Finally its time to generate the omaha response to give to client that
     # lets them know where to find the payload and its associated metadata.
@@ -854,12 +863,38 @@
     if self.public_key:
       public_key_data = base64.b64encode(open(self.public_key, 'r').read())
 
-    _Log('Responding to client to use url %s to get image', url)
-    return autoupdate_lib.GetUpdateResponse(
+    update_response =  autoupdate_lib.GetUpdateResponse(
         metadata_obj.sha1, metadata_obj.sha256, metadata_obj.size, url,
         metadata_obj.is_delta_format, metadata_obj.metadata_size,
         signed_metadata_hash, public_key_data, protocol, self.critical_update)
 
+    # Make sure we can proceed with the response (critical section).
+    with self._update_response_lock:
+      # If the number of responses sent already exceeds the max allowed, abort.
+      if self.max_updates == 0:
+        _Log('Max allowed number of update responses already sent, '
+             'aborting this one (id=%d)', request_id)
+        return autoupdate_lib.GetNoUpdateResponse(protocol)
+
+      # If there's been a more recent request, we assume the client timed out
+      # on this request and should not respond to it.
+      # IMPORTANT: we want to do this as close as posible to where we commit to
+      # making a response, i.e. right before we update the response tally. This
+      # is why this check happens in the critical section.
+      curr_request_id = self.curr_request_id
+      if curr_request_id != request_id:
+        _Log('A more recent request was received (id=%d), aborting this one '
+             '(id=%d)', curr_request_id, request_id)
+        return autoupdate_lib.GetNoUpdateResponse(protocol)
+
+      # Update the counter, committing to make the response.
+      self.max_updates -= 1
+
+    # At this point, we're good to go with the response.
+    _Log('Responding to client to use url %s to get image (id=%d)', url,
+         request_id)
+    return update_response
+
   def HandleHostInfoPing(self, ip):
     """Returns host info dictionary for the given IP in JSON format."""
     assert ip, 'No ip provided.'
commit	d0c7175608ba293d1639e9baca9f7d7f84c4e137	[log] [tgz]
author	Gilad Arnold <garnold@chromium.org>	Fri Dec 06 11:48:45 2013 -0800
committer	chrome-internal-fetch <chrome-internal-fetch@google.com>	Mon Dec 09 19:25:02 2013 +0000
tree	fe0152669f8dc15ee0b19ec2ee69fc2ed492932e
parent	9fa4bde4d41a1f9cea0b1a7ce1472299f221ca96 [diff] [blame]