src/websites/youtube.rb

   1 #
   2 # Copyright Michael Orlitzky
   3 #
   4 # http://michael.orlitzky.com/
   5 #
   6 # This program is free software: you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation, either version 3 of the License, or
   9 # (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14 # GNU General Public License for more details.
  15 #
  16 # http://www.fsf.org/licensing/licenses/gpl.html
  17 #
  18
  19 require 'src/website'
  20 require 'cgi'
  21
  22 class Youtube < Website
  23
  24   VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?([a-z0-9]+\.)?youtube\.com\/((watch\?v=)|(v\/)|([a-z]+\#[a-z]\/[a-z]\/[0-9]\/))[a-z0-9_\-]+(\&.*)?\#?$/i
  25
  26   def self.owns_url?(url)
  27     return url =~ VALID_YOUTUBE_URL_REGEX
  28   end
  29
  30
  31   def initialize(url)
  32     super
  33
  34     # The @format variable just caches the format of the video we're
  35     # downloading. Storing it will prevent us from having to calculate
  36     # it twice.
  37     @format = 0
  38   end
  39
  40
  41   def get_video_url()
  42     video_id = self.parse_video_id()
  43
  44     # The video's URL (the "page data" URL) may be different from the
  45     # URL that was passed to the program. We support the /v/video_id
  46     # URL format, but that is *not* the main video page where we can
  47     # retrieve the "t" parameter. We can only get that from the
  48     # /watch?v=video_id form.
  49     page_data_url = "http://www.youtube.com/watch?v=#{video_id}"
  50     page_data = self.get_page_data(page_data_url)
  51
  52     # Magic.
  53     t_parameter = self.parse_t_parameter(page_data)
  54
  55     video_url = "http://www.youtube.com/get_video?video_id=#{video_id}&t=#{t_parameter}"
  56
  57     # Figure out which formats are available, and if any are,
  58     # choose the best one.
  59     available_formats = get_available_formats(page_data)
  60     desired_format = get_desired_format(available_formats)
  61
  62     if not desired_format.nil?
  63       # First we cache the format so that when we're asked for the
  64       # video filename later, we don't have to recompute the format.
  65       @format = desired_format
  66
  67       # And then stick the format parameter on the end of the URL.
  68       video_url = video_url + "&fmt=#{desired_format}"
  69     end
  70
  71     return video_url
  72   end
  73
  74
  75   def get_video_filename()
  76     # The format -> extension mapping is available on Wikipedia:
  77     #
  78     #   http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs
  79     #
  80     # The default extension is .flv.
  81     extension = '.flv'
  82
  83     if [18, 22, 35, 37].include?(@format)
  84       extension = '.mp4'
  85     elsif (@format == 17)
  86       extension = '.3gp'
  87     end
  88
  89     return (self.parse_video_id() + extension)
  90   end
  91
  92
  93   protected;
  94
  95   # Get the video id from the URL. Should be relatively easy,
  96   # unless Youtube supports some URL formats of which I'm unaware.
  97   def parse_video_id()
  98     # Both URLs are fairly easy to parse if you handle
  99     # them one at a time. The only tricky situation is when
 100     # parameters like "&hl=en" are tacked on to the end.
 101     # We'll call /watch?v=video_id the "first form."
 102     first_form_video_id_regex = /v=([0-9a-z_\-]+)/i
 103     first_form_matches = first_form_video_id_regex.match(@url)
 104     return first_form_matches[1] if not (first_form_matches.nil? ||
 105                                          first_form_matches.length < 2)
 106
 107     # First form didn't work? Try the second.
 108     second_form_video_id_regex = /\/v\/([0-9a-z_\-]+)/i
 109     second_form_matches = second_form_video_id_regex.match(@url)
 110     return second_form_matches[1] if not (second_form_matches.nil? ||
 111                                           second_form_matches.length < 2)
 112
 113     # ...and the third.
 114     third_form_video_id_regex = /\/([[:alnum:]]+)$/i
 115     third_form_matches = third_form_video_id_regex.match(@url)
 116     return third_form_matches[1] if not (third_form_matches.nil? ||
 117                                          third_form_matches.length < 2)
 118
 119     # If we made it here, we couldn't figure out the video id. Yes,
 120     # this is fatal, since we don't know where the video file is
 121     # located.
 122     raise StandardError.new("Could not parse the video id.")
 123   end
 124
 125
 126   # Parse out the "t" parameter from the video's page. I'm not sure
 127   # what "t" stands for, but it's located in some JSON, and is required
 128   # for the final video URL to work.
 129   def parse_t_parameter(page_data)
 130     t_parameter = nil
 131
 132     t_parameter_regex = /\"t\"\:[[:space:]]\"([^\"]+?)\"/
 133     matches = t_parameter_regex.match(page_data)
 134     t_parameter = matches[1] if not (matches.nil? || matches.length < 2)
 135
 136     return t_parameter
 137   end
 138
 139
 140   def get_available_formats(page_data)
 141     # Parse the list of available formats from the "fmt_list" Flash
 142     # variable.
 143     available_formats = []
 144     fmt_list_regex = /\"fmt_list\"\:[[:space:]]\"([^\"]+?)\"/
 145     matches = fmt_list_regex.match(page_data)
 146
 147     if matches.nil?
 148       raise StandardError.new("Could not find any valid formats.")
 149     end
 150
 151     fmts_string = CGI::unescape(matches[1])
 152     fmts_string.split(',').each do |fmt|
 153       # Each "fmt" will look something like,
 154       #
 155       #   35/640000/9/0/115
 156       #
 157       # with the format identifier coming before the first slash.
 158       first_slash_idx = fmt.index('/')
 159       available_formats << fmt[0...first_slash_idx].to_i
 160     end
 161
 162     return available_formats
 163   end
 164
 165
 166   def get_desired_format(available_formats)
 167     # Check for the presence of formats, in order of preference
 168     # (quality). That is, we check for the best formats first. As soon
 169     # as a format is found to be available, we return it as the
 170     # desired format, since the first format we find is going to be
 171     # the best available format.
 172     return 37 if available_formats.include?(37)
 173     return 22 if available_formats.include?(22)
 174     return 35 if available_formats.include?(35)
 175     return 18 if available_formats.include?(18)
 176     return 34 if available_formats.include?(34)
 177     return 17 if available_formats.include?(17)
 178   end
 179
 180 end