src/websites/youtube.rb

   1 #
   2 # Copyright Michael Orlitzky
   3 #
   4 # http://michael.orlitzky.com/
   5 #
   6 # This program is free software: you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation, either version 3 of the License, or
   9 # (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14 # GNU General Public License for more details.
  15 #
  16 # http://www.fsf.org/licensing/licenses/gpl.html
  17 #
  18
  19 require 'src/website'
  20
  21 # Needed to download the page, which is in turn
  22 # needed because it contains the video URL.
  23 require 'net/http'
  24 require 'uri'
  25
  26
  27 class Youtube < Website
  28
  29   VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?(www\.)?youtube\.com\/((watch\?v=)|(v\/))[[:alnum:]]+(\&.*)?\#?$/
  30
  31   def self.owns_url?(url)
  32     return url =~ VALID_YOUTUBE_URL_REGEX
  33   end
  34
  35
  36   def get_video_url(url)
  37     video_id = self.parse_video_id(url)
  38
  39     # The video's URL (the "page data" URL) may be different from the
  40     # URL that was passed to the program. We support the /v/video_id
  41     # URL format, but that is *not* the main video page where we can
  42     # retrieve the "t" parameter. We can only get that from the
  43     # /watch?v=video_id form.
  44     page_data_url = "http://www.youtube.com/watch?v=#{video_id}"
  45     page_data = self.get_page_data(page_data_url)
  46
  47     # Magic.
  48     t_parameter = self.parse_t_parameter(page_data)
  49
  50     video_url = "http://www.youtube.com/get_video?video_id=#{video_id}&t=#{t_parameter}"
  51
  52     return video_url
  53   end
  54
  55
  56   protected;
  57
  58   # Get the video id from the URL. Should be relatively easy,
  59   # unless Youtube supports some URL formats of which I'm unaware.
  60   def parse_video_id(url)
  61     # Return nil if we get no matches below.
  62     video_id = nil
  63
  64     # Both URLs are fairly easy to parse if you handle
  65     # them one at a time. The only tricky situation is when
  66     # parameters like "&hl=en" are tacked on to the end.
  67     # We'll call /watch?v=video_id the "first form."
  68     first_form_video_id_regex = /v=([[:alnum:]]+)$/
  69     first_form_matches = first_form_video_id_regex.match(url)
  70     return first_form_matches[1] if not (first_form_matches.nil? ||
  71                                          first_form_matches.length < 2)
  72
  73     # First form didn't work? Try the second.
  74     second_form_video_id_regex = /\/v\/([[:alnum:]]+)/
  75     second_form_matches = second_form_video_id_regex.match(url)
  76     video_id = second_form_matches[1] if not (second_form_matches.nil? ||
  77                                               second_form_matches.length < 2)
  78
  79     return video_id
  80   end
  81
  82
  83   # Parse out the "t" parameter from the video's page. I'm not sure
  84   # what "t" stands for, but it's located in some JSON, and is required
  85   # for the final video URL to work.
  86   def parse_t_parameter(page_data)
  87     t_parameter = nil
  88
  89     t_parameter_regex = /\"t\"\:[[:space:]]\"([^\"]+?)\"/
  90     matches = t_parameter_regex.match(page_data)
  91     t_parameter = matches[1] if not (matches.nil? || matches.length < 2)
  92
  93     return t_parameter
  94   end
  95
  96
  97   def get_page_data(url)
  98     uri = URI.parse(url)
  99
 100     response = Net::HTTP.start(uri.host, uri.port) do |http|
 101       http.get(uri.request_uri)
 102     end
 103
 104     return response.body
 105   end
 106
 107 end