On this page
class URI::RFC2396_Parser
Class that parses String's into URI's.
It contains a Hash set of patterns and Regexp's that match and validate.
Attributes
The Hash of patterns.
See also URI::Parser.initialize_pattern.
See also URI::Parser.initialize_regexp.
Public Class Methods
# File lib/uri/rfc2396_parser.rb, line 99
def initialize(opts = {})
  @pattern = initialize_pattern(opts)
  @pattern.each_value(&:freeze)
  @pattern.freeze
  @regexp = initialize_regexp(@pattern)
  @regexp.each_value(&:freeze)
  @regexp.freeze
end
      Synopsis
URI::Parser.new([opts])
      Args
The constructor accepts a hash as options for parser. Keys of options are pattern names of URI components and values of options are pattern strings. The constructor generates set of regexps for parsing URIs.
You can use the following keys:
* :ESCAPED (URI::PATTERN::ESCAPED in default)
* :UNRESERVED (URI::PATTERN::UNRESERVED in default)
* :DOMLABEL (URI::PATTERN::DOMLABEL in default)
* :TOPLABEL (URI::PATTERN::TOPLABEL in default)
* :HOSTNAME (URI::PATTERN::HOSTNAME in default)
      Examples
p = URI::Parser.new(:ESCAPED => "(?:%[a-fA-F0-9]{2}|%u[a-fA-F0-9]{4})")
u = p.parse("http://example.jp/%uABCD") #=> #<URI::HTTP http://example.jp/%uABCD>
URI.parse(u.to_s) #=> raises URI::InvalidURIError
s = "http://example.com/ABCD"
u1 = p.parse(s) #=> #<URI::HTTP http://example.com/ABCD>
u2 = URI.parse(s) #=> #<URI::HTTP http://example.com/ABCD>
u1 == u2 #=> true
u1.eql?(u2) #=> false
     Public Instance Methods
# File lib/uri/rfc2396_parser.rb, line 287
def escape(str, unsafe = @regexp[:UNSAFE])
  unless unsafe.kind_of?(Regexp)
    # perhaps unsafe is String object
    unsafe = Regexp.new("[#{Regexp.quote(unsafe)}]", false)
  end
  str.gsub(unsafe) do
    us = $&
    tmp = ''
    us.each_byte do |uc|
      tmp << sprintf('%%%02X', uc)
    end
    tmp
  end.force_encoding(Encoding::US_ASCII)
end
      Args
str- 
        
Stringto make safe unsafe- 
        
Regexpto apply. Defaults to self.regexp 
Description
Constructs a safe String from str, removing unsafe characters, replacing them with codes.
# File lib/uri/rfc2396_parser.rb, line 249
def extract(str, schemes = nil)
  if block_given?
    str.scan(make_regexp(schemes)) { yield $& }
    nil
  else
    result = []
    str.scan(make_regexp(schemes)) { result.push $& }
    result
  end
end
      Args
str- 
        
Stringto search schemes- 
        
Patterns to apply to
str 
Description
Attempts to parse and merge a set of URIs. If no block given, then returns the result, else it calls block for each element in result.
See also URI::Parser.make_regexp.
# File lib/uri/rfc2396_parser.rb, line 325
def inspect
  @@to_s.bind_call(self)
end
      # File lib/uri/rfc2396_parser.rb, line 223
def join(*uris)
  uris[0] = convert_to_uri(uris[0])
  uris.inject :merge
end
      Args
uris- 
        
an
Arrayof Strings 
Description
Attempts to parse and merge a set of URIs.
# File lib/uri/rfc2396_parser.rb, line 262
def make_regexp(schemes = nil)
  unless schemes
    @regexp[:ABS_URI_REF]
  else
    /(?=#{Regexp.union(*schemes)}:)#{@pattern[:X_ABS_URI]}/x
  end
end
      Returns Regexp that is default self.regexp, unless schemes is provided. Then it is a Regexp.union with self.pattern.
# File lib/uri/rfc2396_parser.rb, line 209
def parse(uri)
  URI.for(*self.split(uri), self)
end
      Args
uri
Description
Parses uri and constructs either matching URI scheme object (File, FTP, HTTP, HTTPS, LDAP, LDAPS, or MailTo) or URI::Generic.
Usage
p = URI::Parser.new
p.parse("ldap://ldap.example.com/dc=example?user=john")
#=> #<URI::LDAP ldap://ldap.example.com/dc=example?user=john>
     # File lib/uri/rfc2396_parser.rb, line 120
def split(uri)
  case uri
  when ''
    # null uri
  when @regexp[:ABS_URI]
    scheme, opaque, userinfo, host, port,
      registry, path, query, fragment = $~[1..-1]
    # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
    # absoluteURI   = scheme ":" ( hier_part | opaque_part )
    # hier_part     = ( net_path | abs_path ) [ "?" query ]
    # opaque_part   = uric_no_slash *uric
    # abs_path      = "/"  path_segments
    # net_path      = "//" authority [ abs_path ]
    # authority     = server | reg_name
    # server        = [ [ userinfo "@" ] hostport ]
    if !scheme
      raise InvalidURIError,
        "bad URI(absolute but no scheme): #{uri}"
    end
    if !opaque && (!path && (!host && !registry))
      raise InvalidURIError,
        "bad URI(absolute but no path): #{uri}"
    end
  when @regexp[:REL_URI]
    scheme = nil
    opaque = nil
    userinfo, host, port, registry,
      rel_segment, abs_path, query, fragment = $~[1..-1]
    if rel_segment && abs_path
      path = rel_segment + abs_path
    elsif rel_segment
      path = rel_segment
    elsif abs_path
      path = abs_path
    end
    # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
    # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
    # net_path      = "//" authority [ abs_path ]
    # abs_path      = "/"  path_segments
    # rel_path      = rel_segment [ abs_path ]
    # authority     = server | reg_name
    # server        = [ [ userinfo "@" ] hostport ]
  else
    raise InvalidURIError, "bad URI(is not URI?): #{uri}"
  end
  path = '' if !path && !opaque # (see RFC2396 Section 5.2)
  ret = [
    scheme,
    userinfo, host, port,         # X
    registry,                     # X
    path,                         # Y
    opaque,                       # Y
    query,
    fragment
  ]
  return ret
end
      # File lib/uri/rfc2396_parser.rb, line 318
def unescape(str, escaped = @regexp[:ESCAPED])
  enc = str.encoding
  enc = Encoding::UTF_8 if enc == Encoding::US_ASCII
  str.gsub(escaped) { [$&[1, 2]].pack('H2').force_encoding(enc) }
end
      Args
str- 
        
Stringto remove escapes from escaped- 
        
Regexpto apply. Defaults to self.regexp 
Description
Removes escapes from str.
Private Instance Methods
# File lib/uri/rfc2396_parser.rb, line 521
def convert_to_uri(uri)
  if uri.is_a?(URI::Generic)
    uri
  elsif uri = String.try_convert(uri)
    parse(uri)
  else
    raise ArgumentError,
      "bad argument (expected URI object or URI string)"
  end
end
      # File lib/uri/rfc2396_parser.rb, line 332
def initialize_pattern(opts = {})
  ret = {}
  ret[:ESCAPED] = escaped = (opts.delete(:ESCAPED) || PATTERN::ESCAPED)
  ret[:UNRESERVED] = unreserved = opts.delete(:UNRESERVED) || PATTERN::UNRESERVED
  ret[:RESERVED] = reserved = opts.delete(:RESERVED) || PATTERN::RESERVED
  ret[:DOMLABEL] = opts.delete(:DOMLABEL) || PATTERN::DOMLABEL
  ret[:TOPLABEL] = opts.delete(:TOPLABEL) || PATTERN::TOPLABEL
  ret[:HOSTNAME] = hostname = opts.delete(:HOSTNAME)
  # RFC 2396 (URI Generic Syntax)
  # RFC 2732 (IPv6 Literal Addresses in URL's)
  # RFC 2373 (IPv6 Addressing Architecture)
  # uric          = reserved | unreserved | escaped
  ret[:URIC] = uric = "(?:[#{unreserved}#{reserved}]|#{escaped})"
  # uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
  #                 "&" | "=" | "+" | "$" | ","
  ret[:URIC_NO_SLASH] = uric_no_slash = "(?:[#{unreserved};?:@&=+$,]|#{escaped})"
  # query         = *uric
  ret[:QUERY] = query = "#{uric}*"
  # fragment      = *uric
  ret[:FRAGMENT] = fragment = "#{uric}*"
  # hostname      = *( domainlabel "." ) toplabel [ "." ]
  # reg-name      = *( unreserved / pct-encoded / sub-delims ) # RFC3986
  unless hostname
    ret[:HOSTNAME] = hostname = "(?:[a-zA-Z0-9\\-.]|%\\h\\h)+"
  end
  # RFC 2373, APPENDIX B:
  # IPv6address = hexpart [ ":" IPv4address ]
  # IPv4address   = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
  # hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
  # hexseq  = hex4 *( ":" hex4)
  # hex4    = 1*4HEXDIG
  #
  # XXX: This definition has a flaw. "::" + IPv4address must be
  # allowed too.  Here is a replacement.
  #
  # IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
  ret[:IPV4ADDR] = ipv4addr = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"
  # hex4     = 1*4HEXDIG
  hex4 = "[#{PATTERN::HEX}]{1,4}"
  # lastpart = hex4 | IPv4address
  lastpart = "(?:#{hex4}|#{ipv4addr})"
  # hexseq1  = *( hex4 ":" ) hex4
  hexseq1 = "(?:#{hex4}:)*#{hex4}"
  # hexseq2  = *( hex4 ":" ) lastpart
  hexseq2 = "(?:#{hex4}:)*#{lastpart}"
  # IPv6address = hexseq2 | [ hexseq1 ] "::" [ hexseq2 ]
  ret[:IPV6ADDR] = ipv6addr = "(?:#{hexseq2}|(?:#{hexseq1})?::(?:#{hexseq2})?)"
  # IPv6prefix  = ( hexseq1 | [ hexseq1 ] "::" [ hexseq1 ] ) "/" 1*2DIGIT
  # unused
  # ipv6reference = "[" IPv6address "]" (RFC 2732)
  ret[:IPV6REF] = ipv6ref = "\\[#{ipv6addr}\\]"
  # host          = hostname | IPv4address
  # host          = hostname | IPv4address | IPv6reference (RFC 2732)
  ret[:HOST] = host = "(?:#{hostname}|#{ipv4addr}|#{ipv6ref})"
  # port          = *digit
  ret[:PORT] = port = '\d*'
  # hostport      = host [ ":" port ]
  ret[:HOSTPORT] = hostport = "#{host}(?::#{port})?"
  # userinfo      = *( unreserved | escaped |
  #                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
  ret[:USERINFO] = userinfo = "(?:[#{unreserved};:&=+$,]|#{escaped})*"
  # pchar         = unreserved | escaped |
  #                 ":" | "@" | "&" | "=" | "+" | "$" | ","
  pchar = "(?:[#{unreserved}:@&=+$,]|#{escaped})"
  # param         = *pchar
  param = "#{pchar}*"
  # segment       = *pchar *( ";" param )
  segment = "#{pchar}*(?:;#{param})*"
  # path_segments = segment *( "/" segment )
  ret[:PATH_SEGMENTS] = path_segments = "#{segment}(?:/#{segment})*"
  # server        = [ [ userinfo "@" ] hostport ]
  server = "(?:#{userinfo}@)?#{hostport}"
  # reg_name      = 1*( unreserved | escaped | "$" | "," |
  #                     ";" | ":" | "@" | "&" | "=" | "+" )
  ret[:REG_NAME] = reg_name = "(?:[#{unreserved}$,;:@&=+]|#{escaped})+"
  # authority     = server | reg_name
  authority = "(?:#{server}|#{reg_name})"
  # rel_segment   = 1*( unreserved | escaped |
  #                     ";" | "@" | "&" | "=" | "+" | "$" | "," )
  ret[:REL_SEGMENT] = rel_segment = "(?:[#{unreserved};@&=+$,]|#{escaped})+"
  # scheme        = alpha *( alpha | digit | "+" | "-" | "." )
  ret[:SCHEME] = scheme = "[#{PATTERN::ALPHA}][\\-+.#{PATTERN::ALPHA}\\d]*"
  # abs_path      = "/"  path_segments
  ret[:ABS_PATH] = abs_path = "/#{path_segments}"
  # rel_path      = rel_segment [ abs_path ]
  ret[:REL_PATH] = rel_path = "#{rel_segment}(?:#{abs_path})?"
  # net_path      = "//" authority [ abs_path ]
  ret[:NET_PATH] = net_path = "//#{authority}(?:#{abs_path})?"
  # hier_part     = ( net_path | abs_path ) [ "?" query ]
  ret[:HIER_PART] = hier_part = "(?:#{net_path}|#{abs_path})(?:\\?(?:#{query}))?"
  # opaque_part   = uric_no_slash *uric
  ret[:OPAQUE_PART] = opaque_part = "#{uric_no_slash}#{uric}*"
  # absoluteURI   = scheme ":" ( hier_part | opaque_part )
  ret[:ABS_URI] = abs_uri = "#{scheme}:(?:#{hier_part}|#{opaque_part})"
  # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
  ret[:REL_URI] = rel_uri = "(?:#{net_path}|#{abs_path}|#{rel_path})(?:\\?#{query})?"
  # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
  ret[:URI_REF] = "(?:#{abs_uri}|#{rel_uri})?(?:##{fragment})?"
  ret[:X_ABS_URI] = "
    (#{scheme}):                           (?# 1: scheme)
    (?:
       (#{opaque_part})                    (?# 2: opaque)
    |
       (?:(?:
         //(?:
             (?:(?:(#{userinfo})@)?        (?# 3: userinfo)
               (?:(#{host})(?::(\\d*))?))? (?# 4: host, 5: port)
           |
             (#{reg_name})                 (?# 6: registry)
           )
         |
         (?!//))                           (?# XXX: '//' is the mark for hostport)
         (#{abs_path})?                    (?# 7: path)
       )(?:\\?(#{query}))?                 (?# 8: query)
    )
    (?:\\#(#{fragment}))?                  (?# 9: fragment)
  "
  ret[:X_REL_URI] = "
    (?:
      (?:
        //
        (?:
          (?:(#{userinfo})@)?       (?# 1: userinfo)
            (#{host})?(?::(\\d*))?  (?# 2: host, 3: port)
        |
          (#{reg_name})             (?# 4: registry)
        )
      )
    |
      (#{rel_segment})              (?# 5: rel_segment)
    )?
    (#{abs_path})?                  (?# 6: abs_path)
    (?:\\?(#{query}))?              (?# 7: query)
    (?:\\#(#{fragment}))?           (?# 8: fragment)
  "
  ret
end
      Constructs the default Hash of patterns.
# File lib/uri/rfc2396_parser.rb, line 490
def initialize_regexp(pattern)
  ret = {}
  # for URI::split
  ret[:ABS_URI] = Regexp.new('\A\s*' + pattern[:X_ABS_URI] + '\s*\z', Regexp::EXTENDED)
  ret[:REL_URI] = Regexp.new('\A\s*' + pattern[:X_REL_URI] + '\s*\z', Regexp::EXTENDED)
  # for URI::extract
  ret[:URI_REF]     = Regexp.new(pattern[:URI_REF])
  ret[:ABS_URI_REF] = Regexp.new(pattern[:X_ABS_URI], Regexp::EXTENDED)
  ret[:REL_URI_REF] = Regexp.new(pattern[:X_REL_URI], Regexp::EXTENDED)
  # for URI::escape/unescape
  ret[:ESCAPED] = Regexp.new(pattern[:ESCAPED])
  ret[:UNSAFE]  = Regexp.new("[^#{pattern[:UNRESERVED]}#{pattern[:RESERVED]}]")
  # for Generic#initialize
  ret[:SCHEME]   = Regexp.new("\\A#{pattern[:SCHEME]}\\z")
  ret[:USERINFO] = Regexp.new("\\A#{pattern[:USERINFO]}\\z")
  ret[:HOST]     = Regexp.new("\\A#{pattern[:HOST]}\\z")
  ret[:PORT]     = Regexp.new("\\A#{pattern[:PORT]}\\z")
  ret[:OPAQUE]   = Regexp.new("\\A#{pattern[:OPAQUE_PART]}\\z")
  ret[:REGISTRY] = Regexp.new("\\A#{pattern[:REG_NAME]}\\z")
  ret[:ABS_PATH] = Regexp.new("\\A#{pattern[:ABS_PATH]}\\z")
  ret[:REL_PATH] = Regexp.new("\\A#{pattern[:REL_PATH]}\\z")
  ret[:QUERY]    = Regexp.new("\\A#{pattern[:QUERY]}\\z")
  ret[:FRAGMENT] = Regexp.new("\\A#{pattern[:FRAGMENT]}\\z")
  ret
end
      Constructs the default Hash of Regexp's.
Ruby Core © 1993–2020 Yukihiro Matsumoto
Licensed under the Ruby License.
Ruby Standard Library © contributors
Licensed under their own licenses.