Recently I read a rant about how HTML5 doesn't have closed caption support. This is true, but it's not as much of a hurdle as you might think. In a few hours I have (mostly) worked out a system that would work exactly the same way as the proposed extension (barring the CSS wizardry requested – I'm not sure how to handle the requested pseudo-element selectors, so this one uses nice, simple CSS classes), all in nice, cross-platform JavaScript.
The script is copied below. I've not actually run it, or tested it, as I don't have any HTML5 video or SRT subtitle tracks to work with, but it should work in theory. And even if it doesn't, this is a reasonable starting point for building a complete system.
It's reasonably well commented, and should work just fine. I've not added any code to display the tracks dropdown, or to handle updating that part, and there's nothing in there to handle switching subtitle tracks, but that is a UI consideration IMHO. I may come back to this post and add that functionality at a later date. It also relies on being able to insert HTML elements inside a VIDEO tag. If that doesn't work, this could easily be changed to place an absolutely-positioned DIV over the top of the video, rather than drawing a DIV inside the video tag.
// Get array of videos on page
var v = document.getElementsByTagName("video");
// Loop through that array
for (var iVideos = 0; iVideos < v.length; iVideos++) {
    // Only videos with at least one "track" child element get the caption machinery
    if (v[iVideos].getElementsByTagName("track").length > 0) {
        setUpVideoCaptions(v[iVideos]);
    }
}

/**
 * Attach caption support to a single video element.
 *
 * Adds these properties to the video element:
 *   tracks        - array of { label, kind, src, srclang, element } objects, one per
 *                   captions/subtitles track; parseWebSRT() later adds `cues` to each
 *   trackSelected - index into `tracks` of the track to display, or -1 for none
 *   subtitleDiv   - the DIV (CSS class "cue") that the current cue is drawn into
 *   displaying    - index of the cue currently shown, or -1 when nothing is shown
 *
 * @param {HTMLVideoElement} video The video to set up.
 */
function setUpVideoCaptions(video) {
    video.tracks = [];
    video.trackSelected = -1; // Change this to an index to change the displaying subtitle.

    // Get an array of tracks for this video
    var t = video.getElementsByTagName("track");
    // Look through those tracks
    for (var iTracks = 0; iTracks < t.length; iTracks++) {
        // A track with no "kind" attribute is treated as "subtitles" (the HTML default),
        // which also avoids calling toLowerCase() on null
        var sKind = (t[iTracks].getAttribute("kind") || "subtitles").toLowerCase();
        // If the kind is "Captions" or "Subtitles"
        if (sKind === "captions" || sKind === "subtitles") {
            // Build the object first and keep the reference: Array.push() returns the
            // new LENGTH of the array, not the index of the pushed item
            var oTrack = {
                "label": t[iTracks].getAttribute("label"),
                "kind": t[iTracks].getAttribute("kind"),
                "src": t[iTracks].getAttribute("src"),
                "srclang": t[iTracks].getAttribute("srclang"),
                "element": video
            };
            video.tracks.push(oTrack);
            loadTrackCues(oTrack);
        }
    }

    // Set up a container for the subtitles.
    // Children of a VIDEO element are only fallback content and are not rendered by
    // browsers that can play the video, so the video is wrapped in a relatively
    // positioned box and the subtitles are laid over the bottom of it instead.
    var oWrapper = document.createElement("div");
    oWrapper.style.position = "relative";
    oWrapper.style.display = "inline-block";
    video.parentNode.insertBefore(oWrapper, video);
    oWrapper.appendChild(video);

    var oSubtitleDiv = document.createElement("div");
    oSubtitleDiv.style.position = "absolute";
    oSubtitleDiv.style.bottom = "0px";
    oSubtitleDiv.style.left = "0px";
    oSubtitleDiv.style.right = "0px";
    // Let clicks fall through to the video (and its controls) underneath
    oSubtitleDiv.style.pointerEvents = "none";
    oSubtitleDiv.className = "cue";
    oWrapper.appendChild(oSubtitleDiv);
    video.subtitleDiv = oSubtitleDiv;
    video.displaying = -1;

    // Update the subtitle container as playback progresses.
    // timeupdate fires a few times a second; the handler only touches the DOM when
    // the active cue actually changes, so it stays cheap.
    video.addEventListener("timeupdate", function () {
        updateSubtitles(video);
    }, false);
}

/**
 * Fetch the text of a track's SRT file asynchronously and hand it to parseWebSRT().
 *
 * On a successful (HTTP 200) response the text is stored on `track.webSRTText`
 * before parsing. Tracks without a `src` are ignored.
 *
 * @param {Object} track A track object as stored in `video.tracks`.
 */
function loadTrackCues(track) {
    if (!track.src) {
        return;
    }
    // Feature-test rather than construct-then-check: `new XMLHttpRequest()` throws
    // when the constructor is missing, it never returns a falsy value
    var xmlhttp = window.XMLHttpRequest ? new XMLHttpRequest() : new ActiveXObject("Microsoft.XMLHTTP");
    xmlhttp.onreadystatechange = function () {
        // If we have a valid response
        if (xmlhttp.readyState === 4 && xmlhttp.status === 200) {
            // Store the text, and fire off an SRT parser
            track.webSRTText = xmlhttp.responseText;
            parseWebSRT(track);
        }
    };
    // Fire off the request asynchronously; nothing is sent until send() is called
    xmlhttp.open("GET", track.src, true);
    xmlhttp.send(null);
}

/**
 * Show the cue of the selected track that covers the video's current time, or
 * clear the subtitle container when no cue is active.
 *
 * @param {HTMLVideoElement} video A video prepared by setUpVideoCaptions().
 */
function updateSubtitles(video) {
    // trackSelected is an index, so 0 is a valid selection and -1 means "none"
    var oTrack = video.trackSelected >= 0 ? video.tracks[video.trackSelected] : null;
    // cues is only present once the SRT file has been fetched and parsed
    var aCues = (oTrack && oTrack.cues) ? oTrack.cues : [];

    // Find the first cue covering the current playback position
    var iActive = -1;
    for (var iSRT = 0; iSRT < aCues.length; iSRT++) {
        if (video.currentTime >= aCues[iSRT].start && video.currentTime < aCues[iSRT].end) {
            iActive = iSRT;
            break;
        }
    }

    // Nothing to do if the same cue of the same track is already on screen
    if (iActive === video.displaying && video.trackSelected === video.displayingTrack) {
        return;
    }

    // Remove the currently displaying contents
    video.subtitleDiv.innerHTML = "";
    if (iActive >= 0) {
        // Add the lines of this cue, one paragraph per line
        var aLines = aCues[iActive].lines;
        for (var iLine = 0; iLine < aLines.length; iLine++) {
            var oLine = document.createElement("p");
            oLine.appendChild(document.createTextNode(aLines[iLine]));
            video.subtitleDiv.appendChild(oLine);
        }
        video.subtitleDiv.className = "cue cue" + aCues[iActive].cuenumber;
    } else {
        video.subtitleDiv.className = "cue";
    }
    video.displaying = iActive;
    // Remember which track the displayed cue came from so a track switch forces a redraw
    video.displayingTrack = video.trackSelected;
}
/**
 * Parse the SRT text stored on a track into an array of cues.
 *
 * Reads `track.webSRTText` and writes the result to `track.cues`. Each cue is
 * `{ cuenumber, start, end, lines }`, where `start` and `end` are in seconds
 * (as returned by parseSRTTime) and `lines` is an array of text lines.
 *
 * Blocks without a "-->" timing line (blank blocks, a header) and blocks whose
 * times cannot be parsed are skipped rather than aborting the whole parse.
 *
 * @param {Object} track Track object carrying the raw text in `webSRTText`.
 */
function parseWebSRT(track) {
    // Set up a variable to contain the text, with normalised line endings
    var text = String(track.webSRTText || "").replace(/\r\n|\r|\n/g, "\n");
    // Split the source into SRT blocks, which are separated by one or more blank lines
    var aSRTParts = text.split(/\n{2,}/);
    var webSRT = [];
    // For each SRT part (indexed loop: for...in would yield the indices, not the text)
    for (var iPart = 0; iPart < aSRTParts.length; iPart++) {
        // Split it into lines
        var aSRTLines = aSRTParts[iPart].split("\n");
        // Locate the timing line; it normally follows the cue number
        var iTiming = -1;
        for (var iLine = 0; iLine < aSRTLines.length; iLine++) {
            if (aSRTLines[iLine].indexOf("-->") !== -1) {
                iTiming = iLine;
                break;
            }
        }
        if (iTiming === -1) {
            continue;
        }
        // Separate out the times
        var aTimes = aSRTLines[iTiming].split("-->");
        // Start time is the first defined
        var sStart = aTimes[0].trim();
        // End time is the second defined. There may be some junk after it,
        // separated by whitespace, so drop the rest
        var sEnd = aTimes[1].trim().split(/\s+/)[0];
        // Parse the times
        var iStart = parseSRTTime(sStart);
        var iEnd = parseSRTTime(sEnd);
        if (isNaN(iStart) || isNaN(iEnd)) {
            continue;
        }
        // The cue number is the line before the timing line; fall back to the
        // cue's position in the file when it is missing or not numeric
        var iCueNumber = iTiming > 0 ? parseInt(aSRTLines[iTiming - 1], 10) : NaN;
        if (isNaN(iCueNumber)) {
            iCueNumber = webSRT.length + 1;
        }
        // Keep a reference to the cue: Array.push() returns the new length, not an index
        var oCue = {
            "cuenumber": iCueNumber,
            "start": iStart,
            "end": iEnd,
            "lines": []
        };
        // Add the text lines that follow the timing line to the cue
        for (var i = iTiming + 1; i < aSRTLines.length; i++) {
            if (aSRTLines[i] !== "") {
                oCue.lines.push(aSRTLines[i]);
            }
        }
        webSRT.push(oCue);
    }
    // Set the cues variable
    track.cues = webSRT;
}
/**
 * Convert an SRT timestamp such as "01:02:03,500" into a number of seconds.
 *
 * The last colon-separated part is seconds with a "," (or ".") before the
 * milliseconds; each part to its left is worth 60 times more (minutes, hours).
 *
 * @param {string} sTime Timestamp in [[HH:]MM:]SS,mmm form.
 * @returns {number} Total seconds, or NaN if a part is not numeric.
 */
function parseSRTTime(sTime) {
    // Split the time into parts, separated by colons
    var aTimeParts = sTime.split(":");
    // The last part is the time in seconds, followed by a comma, followed by the time in ms
    // Easy way to sort that is to replace the , with a . and parse it as a float
    var iSecs = parseFloat(aTimeParts[aTimeParts.length - 1].replace(",", "."));
    var iPosition = 60;
    // Loop through the remaining parts BACKWARDS, down to and including index 0
    // (the hours field when the timestamp has three parts)
    for (var i = aTimeParts.length - 2; i >= 0; i--) {
        // Increment the number of seconds, dependent on the position
        iSecs += parseInt(aTimeParts[i], 10) * iPosition;
        // Moving to the next highest denominator, multiply by 60.
        // As the SRT only lists up to hours, this is a safe assumption to make
        iPosition = iPosition * 60;
    }
    // return the number of seconds total
    return iSecs;
}