The script is copied below. I've not actually run it, or tested it, as I don't have any HTML5 video or SRT subtitle tracks to work with, but it should work in theory. And even if it doesn't, this is a reasonable starting point for building a complete system.
  It's reasonably well commented, and should work just fine. I've not added any code to display the tracks dropdown, or to handle updating that part, and there's nothing in there to handle switching subtitle tracks, but that is a UI consideration IMHO. I may come back to this post and add that functionality at a later date. It also relies on being able to insert HTML elements inside a VIDEO tag. If that doesn't work, this could easily be changed to place an absolutely-positioned DIV over the top of the video, rather than drawing a DIV inside the video tag.
     // Subtitle bootstrap: wires SRT captions/subtitles onto every <video> that has <track> children.
  /**
   * Returns the index of the first cue whose time span covers `currentTime`.
   *
   * @param {Array} cues - Cue objects with numeric `start` and `end` properties.
   * @param {number} currentTime - Playback position, in the same unit as the cue times.
   * @returns {number} Index into `cues`, or -1 when no cue is active.
   */
  function findActiveCueIndex(cues, currentTime) {
      for (var iCue = 0; iCue < cues.length; iCue++) {
          // Half-open interval: a cue is active from its start up to (not including) its end
          if (currentTime >= cues[iCue].start && currentTime < cues[iCue].end) {
              return iCue;
          }
      }
      return -1;
  }

  /**
   * Fetches the subtitle file for one track descriptor and passes it to parseWebSRT.
   *
   * @param {Object} track - Descriptor created by setUpVideoSubtitles; `track.src` is the URL to fetch.
   *                         On success `track.webSRTText` is set and parseWebSRT(track) is called.
   */
  function loadSubtitleTrack(track) {
      var xmlhttp = new XMLHttpRequest();

      xmlhttp.onreadystatechange = function () {
          // readyState 4 means the request has finished; only accept a 200 response
          if (xmlhttp.readyState === 4 && xmlhttp.status === 200) {
              // Store the text, and fire off an SRT parser
              track.webSRTText = xmlhttp.responseText;
              parseWebSRT(track);
          }
      };

      // Fire off the request asynchronously (open() alone does not send anything)
      xmlhttp.open("GET", track.src, true);
      xmlhttp.send();
  }

  /**
   * Redraws the subtitle container of a video for its current playback position.
   * Does nothing while the same cue of the same track is still on screen.
   *
   * @param {Object} video - A video element previously prepared by setUpVideoSubtitles.
   */
  function updateSubtitleDisplay(video) {
      var iTrack = video.trackSelected;

      // trackSelected is -1 while no track is chosen; 0 is a valid index
      var track = iTrack >= 0 ? video.tracks[iTrack] : undefined;
      var iCue = (track && track.cues) ? findActiveCueIndex(track.cues, video.currentTime) : -1;

      if (iCue === video.displaying && iTrack === video.displayingTrack) {
          return;
      }

      // Remove the currently displaying contents
      var oDiv = video.subtitleDiv;
      while (oDiv.firstChild) {
          oDiv.removeChild(oDiv.firstChild);
      }
      oDiv.className = "cue";

      if (iCue !== -1) {
          var cue = track.cues[iCue];

          // Add one paragraph per text line; text nodes keep subtitle text from being read as markup
          for (var iLine = 0; iLine < cue.lines.length; iLine++) {
              var oLine = document.createElement("p");
              oLine.appendChild(document.createTextNode(cue.lines[iLine]));
              oDiv.appendChild(oLine);
          }
          oDiv.className = "cue cue" + cue.cuenumber;
      }

      video.displaying = iCue;
      video.displayingTrack = iTrack;
  }

  /**
   * Prepares one video element: collects its caption/subtitle tracks, starts loading
   * them, creates the subtitle container and listens for playback time changes.
   *
   * Adds these properties to the element: `tracks`, `trackSelected`, `subtitleDiv`,
   * `displaying` and `displayingTrack`.
   *
   * @param {Object} video - The video element to prepare.
   */
  function setUpVideoSubtitles(video) {
      video.tracks = [];
      video.trackSelected = -1; // Change this to an index to change the displaying subtitle.

      // Get an array of tracks for this video
      var t = video.getElementsByTagName("track");

      // Look through those tracks
      for (var iTracks = 0; iTracks < t.length; iTracks++) {

          // A <track> without a kind attribute defaults to "subtitles"
          var sKind = (t[iTracks].getAttribute("kind") || "subtitles").toLowerCase();

          // Only "captions" and "subtitles" tracks are of interest
          if (sKind !== "captions" && sKind !== "subtitles") {
              continue;
          }

          // Add an object to the tracks array for this video object
          var track = {
              "label": t[iTracks].getAttribute("label"),
              "kind": t[iTracks].getAttribute("kind"),
              "src": t[iTracks].getAttribute("src"),
              "srclang": t[iTracks].getAttribute("srclang"),
              "element": video
          };
          video.tracks.push(track);

          // Nothing to download when the track has no src
          if (track.src) {
              loadSubtitleTrack(track);
          }
      }

      // Set up a container for the subtitles
      // NOTE(review): browsers that support <video> treat its children as fallback content
      // and do not render them, so this container may need to become an absolutely-positioned
      // overlay next to the video instead - confirm in the target browsers.
      var oSubtitleDiv = document.createElement("div");
      oSubtitleDiv.style.position = "relative";
      oSubtitleDiv.style.bottom = "0px";
      oSubtitleDiv.style.left = "0px";
      oSubtitleDiv.style.right = "0px";
      oSubtitleDiv.className = "cue";
      video.appendChild(oSubtitleDiv);
      video.subtitleDiv = oSubtitleDiv;
      video.displaying = -1;
      video.displayingTrack = -1;

      // Update the subtitle container whenever the playback position changes.
      // Hopefully this doesn't fire TOO often; updateSubtitleDisplay bails out early
      // when the visible cue has not changed, so the per-event cost stays small.
      video.addEventListener("timeupdate", function () {
          updateSubtitleDisplay(video);
      }, false);
  }

  /**
   * Entry point: prepares every video on the page that has at least one <track> child.
   */
  function initVideoSubtitles() {
      // Get array of videos on page
      var v = document.getElementsByTagName("video");

      // Loop through that array
      for (var iVideos = 0; iVideos < v.length; iVideos++) {

          // If the video has a "Track" element (or two)
          if (v[iVideos].getElementsByTagName("track").length > 0) {
              setUpVideoSubtitles(v[iVideos]);
          }
      }
  }

  // Run straight away, as the script always has; skipped only where there is no DOM at all
  if (typeof document !== "undefined") {
      initVideoSubtitles();
  }
   
  /**
   * Parses the SRT text stored on a track descriptor into an array of cues.
   *
   * Each blank-line-separated block is expected to hold an optional cue number line,
   * a "start --> end" timing line, and one or more text lines. Blocks without a
   * timing line (for example a trailing empty block) and blocks whose times cannot
   * be parsed are skipped.
   *
   * @param {Object} track - Descriptor whose `webSRTText` holds the raw file contents.
   *                         On return `track.cues` is an array of
   *                         `{cuenumber, start, end, lines}` objects, with `start`/`end`
   *                         as returned by parseSRTTime.
   */
  function parseWebSRT(track) {

      // Set up a variable to contain the text, with normalised line endings
      var text = String(track.webSRTText || "").replace(/\r\n|\r/g, "\n");

      // Split the source into SRT blocks (one or more blank lines between them)
      var aSRTParts = text.split(/\n{2,}/);

      var webSRT = [];

      // For each SRT part
      for (var iPart = 0; iPart < aSRTParts.length; iPart++) {

          // Split it into lines
          var aSRTLines = aSRTParts[iPart].split("\n");

          // Find the timing line; it normally follows the cue number but is located
          // by its "-->" marker so stray or missing lines before it do not matter
          var iTiming = -1;
          for (var iLine = 0; iLine < aSRTLines.length; iLine++) {
              if (aSRTLines[iLine].indexOf("-->") !== -1) {
                  iTiming = iLine;
                  break;
              }
          }
          if (iTiming === -1) {
              continue;
          }

          // Separate out the times
          var aTimes = aSRTLines[iTiming].split("-->");

          // Start time is the first defined
          var sStart = aTimes[0].replace(/^\s+|\s+$/g, "");

          // End time is the second defined. There may be some junk after it, separated by a space, so drop the rest
          var sEnd = aTimes[1].replace(/^\s+/, "").split(/\s+/)[0];

          // Parse the times
          var iStart = parseSRTTime(sStart);
          var iEnd = parseSRTTime(sEnd);
          if (isNaN(iStart) || isNaN(iEnd)) {
              continue;
          }

          // The cue number sits on the line before the timing line; fall back to the
          // cue's position in the file when it is missing or not numeric
          var iCueNumber = iTiming > 0 ? parseInt(aSRTLines[iTiming - 1], 10) : NaN;
          if (isNaN(iCueNumber)) {
              iCueNumber = webSRT.length + 1;
          }

          var cue = {
              "cuenumber": iCueNumber,
              "start": iStart,
              "end": iEnd,
              "lines": []
          };

          // Everything after the timing line is cue text; empty strings left over
          // from a trailing newline are dropped
          for (var i = iTiming + 1; i < aSRTLines.length; i++) {
              if (aSRTLines[i] !== "") {
                  cue.lines.push(aSRTLines[i]);
              }
          }

          // Add the cue to the array
          webSRT.push(cue);
      }

      // Set the cues variable
      track.cues = webSRT;
  }
   
  /**
   * Converts an SRT timestamp such as "01:02:03,500" into a number of seconds.
   *
   * The last colon-separated part is read as seconds (a comma or a dot may separate
   * the milliseconds); every part before it is worth 60 times more than the part to
   * its right, so "MM:SS,mmm" works as well as "HH:MM:SS,mmm".
   *
   * @param {string} sTime - The timestamp text.
   * @returns {number} Total seconds, or NaN when a part is not numeric.
   */
  function parseSRTTime(sTime) {

      // Split the time into parts, separated by colons
      var aTimeParts = sTime.split(":");

      // The last part is the time in seconds, followed by a comma, followed by the time in ms
      // Easy way to sort that is to replace the , with a . and parse it as a float
      var iSecs = parseFloat(aTimeParts[aTimeParts.length - 1].replace(",", "."));
      var iPosition = 60;

      // Loop through the remaining parts BACKWARDS, down to and including the first one
      for (var i = aTimeParts.length - 2; i >= 0; i--) {

          // Increment the number of seconds, dependent on the position
          // (radix 10 so zero-padded values such as "08" are never read as octal)
          iSecs += parseInt(aTimeParts[i], 10) * iPosition;

          // Moving to the next highest denominator, multiply by 60.
          // As the SRT only lists up to hours, this is a safe assumption to make
          iPosition = iPosition * 60;
      }

      // return the number of seconds total
      return iSecs;
  }