Frederick Zhang c27eed4606
Curl: Fix following redirections when base changes
Update the base URL whenever a response has an absolute location, so that
subsequent relative locations are resolved against the new base.

Consider the cURL output below for https://example_one.com:

    HTTP/1.1 302 Moved Temporarily
    Location: https://example_two.com

    HTTP/1.1 302 Moved Temporarily
    Location: /foo/

    HTTP/1.1 200 OK

The final URL should be https://example_two.com/foo/ rather than
https://example_one.com/foo/.
2022-11-30 01:32:24 +11:00

581 lines
21 KiB
Ruby

# typed: false
# frozen_string_literal: true
require "open3"
require "extend/time"
module Utils
# Helper function for interacting with `curl`.
#
# @api private
module Curl
extend T::Sig
using TimeRemaining
# This regex is used to extract the part of an ETag within quotation marks,
# ignoring any leading weak validator indicator (`W/`). This simplifies
# ETag comparison in `#curl_check_http_content`. The quoted value is
# available as capture group 1.
ETAG_VALUE_REGEX = %r{^(?:[wW]/)?"((?:[^"]|\\")*)"}.freeze
# HTTP responses and body content are typically separated by a double
# `CRLF` (whereas HTTP header lines are separated by a single `CRLF`).
# In rare cases, this can also be a double newline (`\n\n`).
# NOTE: this constant only holds the double-`CRLF` form, and
# `#parse_curl_output` splits on this constant alone, so output that uses
# `\n\n` as the separator is not split into responses.
HTTP_RESPONSE_BODY_SEPARATOR = "\r\n\r\n"
# This regex is used to isolate the parts of an HTTP status line, namely
# the status code and any following descriptive text (e.g., `Not Found`).
# They are exposed as the named captures `code` and `text`; `text` is
# optional.
HTTP_STATUS_LINE_REGEX = %r{^HTTP/.* (?<code>\d+)(?: (?<text>[^\r\n]+))?}.freeze
# These constants are implementation details of the parsing helpers below.
private_constant :ETAG_VALUE_REGEX, :HTTP_RESPONSE_BODY_SEPARATOR, :HTTP_STATUS_LINE_REGEX
module_function
# Picks the `curl` executable to run.
#
# @param use_homebrew_curl [true, false] When truthy,
#   `HOMEBREW_BREWED_CURL_PATH` is returned as-is (and is not memoized).
# @return The brewed `curl` path, or the memoized `shared/curl` path under
#   `HOMEBREW_SHIMS_PATH`.
def curl_executable(use_homebrew_curl: false)
  if use_homebrew_curl
    HOMEBREW_BREWED_CURL_PATH
  else
    @curl_executable ||= HOMEBREW_SHIMS_PATH/"shared/curl"
  end
end
# Asks the `curl` executable (via `--homebrew=print-path`) which path it
# resolves to and memoizes the answer.
#
# @return The chomped output, or `nil` when the output is blank. A `nil`
#   result is not memoized by `||=`, so the command is run again next time.
def curl_path
  @curl_path ||= begin
    printed_path = Utils.popen_read(curl_executable, "--homebrew=print-path")
    printed_path.chomp.presence
  end
end
# Forgets the value memoized by `#curl_path`, so the next call to
# `#curl_path` queries the `curl` executable again.
def clear_path_cache
@curl_path = nil
end
# Builds the argument list for a `curl` invocation: the options this module
# always sets, followed by the caller's own arguments.
#
# @param extra_args Arguments appended verbatim after the generated ones.
# @param connect_timeout [Integer, Float, nil] Value for `--connect-timeout`
#   (rounded to three decimals); omitted when blank.
# @param max_time [Integer, Float, nil] Value for `--max-time` (rounded to
#   three decimals); omitted when blank.
# @param retries [Integer, nil] Value for `--retry`; omitted when `nil` or
#   not positive.
# @param retry_max_time [Integer, Float, nil] Value for `--retry-max-time`
#   (rounded to an integer); omitted when blank.
# @param show_output [true, false, nil] When exactly `true`, the
#   `--fail`/progress/verbosity options are left out.
# @param user_agent [String, Symbol, nil] `:browser`/`:fake`, `:default`,
#   `nil` (same as `:default`) or a literal user agent string.
# @return [Array] The complete argument list.
# @raise [TypeError] If `user_agent` is none of the accepted values.
sig {
  params(
    extra_args:      T.untyped,
    connect_timeout: T.any(Integer, Float, NilClass),
    max_time:        T.any(Integer, Float, NilClass),
    retries:         T.nilable(Integer),
    retry_max_time:  T.any(Integer, Float, NilClass),
    show_output:     T.nilable(T::Boolean),
    user_agent:      T.any(String, Symbol, NilClass),
  ).returns(T::Array[T.untyped])
}
def curl_args(
  *extra_args,
  connect_timeout: nil,
  max_time: nil,
  retries: Homebrew::EnvConfig.curl_retries.to_i,
  retry_max_time: nil,
  show_output: false,
  user_agent: nil
)
  generated = []

  # do not load .curlrc unless requested (must be the first argument)
  generated.push("--disable") unless Homebrew::EnvConfig.curlrc?

  # `--cookie /dev/null`: echo any cookies received on a redirect
  generated.push("--cookie", "/dev/null", "--globoff", "--show-error")

  agent_string = case user_agent
  when :browser, :fake then HOMEBREW_USER_AGENT_FAKE_SAFARI
  when :default, nil then HOMEBREW_USER_AGENT_CURL
  when String then user_agent
  else
    raise TypeError, ":user_agent must be :browser/:fake, :default, or a String"
  end
  generated.push("--user-agent", agent_string)

  generated.push("--header", "Accept-Language: en")

  if show_output != true
    generated.push("--fail")
    generated.push("--progress-bar") unless Context.current.verbose?
    generated.push("--verbose") if Homebrew::EnvConfig.curl_verbose?
    generated.push("--silent") unless $stdout.tty?
  end

  generated.push("--connect-timeout", connect_timeout.round(3)) if connect_timeout.present?
  generated.push("--max-time", max_time.round(3)) if max_time.present?

  # A non-positive integer (e.g., 0) or `nil` will omit this argument
  generated.push("--retry", retries) if retries&.positive?

  generated.push("--retry-max-time", retry_max_time.round) if retry_max_time.present?

  [*generated, *extra_args]
end
# Runs `curl` through `system_command` and, when the failure looks like a
# known HTTP/2 problem, retries once with `--http1.1`.
#
# The retries are made with the same environment, the same `curl` executable
# and whatever is left of the original time budget as the first attempt.
#
# @param args Arguments for `curl`, passed through `#curl_args`.
# @param secrets, print_stdout, print_stderr, debug, verbose Forwarded to
#   `system_command` when not `nil`.
# @param env [Hash] Environment passed to `system_command`.
# @param timeout [Numeric, nil] Overall time budget in seconds for this call
#   including any retry.
# @param use_homebrew_curl [true, false] Passed to `#curl_executable`.
# @param options Remaining keyword arguments, passed to `#curl_args`.
# @return The `system_command` result of the last attempt.
# @raise [Timeout::Error] If a `timeout` was given and `curl` exited with
#   status 28.
def curl_with_workarounds(
  *args,
  secrets: nil, print_stdout: nil, print_stderr: nil, debug: nil,
  verbose: nil, env: {}, timeout: nil, use_homebrew_curl: false, **options
)
  end_time = Time.now + timeout if timeout

  command_options = {
    secrets:      secrets,
    print_stdout: print_stdout,
    print_stderr: print_stderr,
    debug:        debug,
    verbose:      verbose,
  }.compact

  result = system_command curl_executable(use_homebrew_curl: use_homebrew_curl),
                          args:    curl_args(*args, **options),
                          env:     env,
                          timeout: end_time&.remaining,
                          **command_options

  # A run that already forced HTTP/1.1 is never retried again.
  return result if result.success? || args.include?("--http1.1")

  exit_status = result.status.exitstatus

  # `&.` keeps an empty stderr from turning the timeout into a NoMethodError.
  raise Timeout::Error, result.stderr.lines.last&.chomp if timeout && exit_status == 28

  # Repeats the request over HTTP/1.1 with the caller's original settings.
  retry_with_http11 = lambda do
    curl_with_workarounds(
      *args, "--http1.1",
      env: env, timeout: end_time&.remaining, use_homebrew_curl: use_homebrew_curl,
      **command_options, **options
    )
  end

  # Error in the HTTP2 framing layer
  return retry_with_http11.call if exit_status == 16

  # This is a workaround for https://github.com/curl/curl/issues/1618.
  if exit_status == 56 # Unexpected EOF
    # Inspect the same executable that produced the failure.
    out = curl_output("-V", use_homebrew_curl: use_homebrew_curl).stdout

    # If `curl` doesn't support HTTP2, the exception is unrelated to this bug.
    return result unless out.include?("HTTP2")

    # The bug is fixed in `curl` >= 7.60.0.
    curl_version = out[/curl (\d+(\.\d+)+)/, 1]
    return result if Gem::Version.new(curl_version) >= Gem::Version.new("7.60.0")

    return retry_with_http11.call
  end

  result
end
# Runs `curl` via `#curl_with_workarounds` and insists that it succeeded.
#
# @param args Arguments for `curl`.
# @param print_stdout [true, false] Forwarded to `#curl_with_workarounds`;
#   defaults to `true` here.
# @param options Further keyword arguments for `#curl_with_workarounds`.
# @return The command result, after `assert_success!` has been called on it.
def curl(*args, print_stdout: true, **options)
  curl_with_workarounds(*args, print_stdout: print_stdout, **options).tap(&:assert_success!)
end
# Downloads with `curl --location --remote-time --output <to>`, optionally
# resuming a partial download.
#
# @param args Arguments for `curl` (typically including the URL).
# @param to The destination path; its parent directory is created.
# @param try_partial [true, false] When truthy, a `--head` request is made
#   first. If the last response advertises `accept-ranges` (anything but
#   `none`), an existing destination is resumed with `--continue-at -`, or
#   skipped entirely when its size already equals `content-length`.
# @param options Keyword arguments forwarded to `#curl_output` and `#curl`.
# @return The result of `#curl`, or `nil` when nothing had to be downloaded.
def curl_download(*args, to: nil, try_partial: false, **options)
  destination = Pathname(to)
  destination.dirname.mkpath

  supports_partial = nil
  if try_partial
    head_output = curl_output("--location", "--head", *args, **options).stdout
    head_responses = parse_curl_output(head_output)[:responses]
    headers = head_responses.present? ? head_responses.last[:headers] : {}

    # Any value for `accept-ranges` other than none indicates that the server supports partial requests.
    # Its absence indicates no support.
    supports_partial = headers.key?("accept-ranges") && headers["accept-ranges"] != "none"

    already_complete = supports_partial &&
                       destination.exist? &&
                       destination.size == headers["content-length"].to_i
    return if already_complete # We've already downloaded all the bytes
  end

  download_args = ["--location", "--remote-time", "--output", destination, *args]

  # continue-at shouldn't be used with servers that don't support partial requests.
  download_args.unshift("--continue-at", "-") if destination.exist? && supports_partial

  curl(*download_args, **options)
end
# Runs `curl` via `#curl_with_workarounds` with `print_stderr: false` and
# `show_output: true` as defaults; the same keys in `options` take precedence.
#
# @param args Arguments for `curl`.
# @param options Keyword arguments for `#curl_with_workarounds`.
# @return Whatever `#curl_with_workarounds` returns (success is not asserted).
def curl_output(*args, **options)
  capture_defaults = { print_stderr: false, show_output: true }
  curl_with_workarounds(*args, **capture_defaults, **options)
end
# Check if a URL is protected by CloudFlare (e.g. badlion.net and jaxx.io).
# @param response [Hash] A response hash from `#parse_curl_response`.
# @return [true, false] Whether a response contains headers indicating that
# the URL is protected by Cloudflare.
sig { params(response: T::Hash[Symbol, T.untyped]).returns(T::Boolean) }
def url_protected_by_cloudflare?(response)
  headers = response[:headers]
  return false if headers.blank?

  # Only 403 and 503 responses are treated as possible Cloudflare blocks.
  status = response[:status_code].to_i
  return false if status != 403 && status != 503

  # A header may hold a single value or an array of values (with `nil`s).
  cookies = Array(headers["set-cookie"]).compact
  servers = Array(headers["server"]).compact

  # Both a Cloudflare cookie and a Cloudflare `server` header are required.
  cookies.any? { |cookie| cookie.match?(/^(__cfduid|__cf_bm)=/i) } &&
    servers.any? { |server| server.match?(/^cloudflare/i) }
end
# Check if a URL is protected by Incapsula (e.g. corsair.com).
# @param response [Hash] A response hash from `#parse_curl_response`.
# @return [true, false] Whether a response contains headers indicating that
# the URL is protected by Incapsula.
sig { params(response: T::Hash[Symbol, T.untyped]).returns(T::Boolean) }
def url_protected_by_incapsula?(response)
  headers = response[:headers]
  # Only a 403 response that carries headers can qualify.
  return false if headers.blank? || response[:status_code].to_i != 403

  # A header may hold a single value or an array of values (with `nil`s).
  Array(headers["set-cookie"]).compact.any? do |cookie|
    cookie.match?(/^(visid_incap|incap_ses)_/i)
  end
end
# Checks that an HTTP(S) URL is reachable and, for `http://` URLs, whether
# the same content appears to be available over HTTPS.
#
# @param url [String] The URL to check; URLs not starting with `http` are
#   skipped.
# @param url_type [String] A label for the URL used in the returned messages
#   (compared with `SharedAudits::URL_TYPE_HOMEPAGE` for the GitHub fallback).
# @param specs [Hash] Extra `curl` options, passed to
#   `#curl_http_content_headers_and_checksum`.
# @param user_agents [Array] User agents to try in order until one yields an
#   OK status.
# @param check_content [true, false] Whether to compare the HTTP and HTTPS
#   bodies after stripping URL schemes.
# @param strict [true, false] Whether to also report bodies that merely have
#   a similar length.
# @param use_homebrew_curl [true, false] Passed to
#   `#curl_http_content_headers_and_checksum`.
# @return [String, nil] A description of the problem, or `nil` when no
#   problem was found (or the check was skipped).
def curl_check_http_content(url, url_type, specs: {}, user_agents: [:default],
                            check_content: false, strict: false, use_homebrew_curl: false)
  return unless url.start_with? "http"

  secure_url = url.sub(/\Ahttp:/, "https:")
  secure_details = nil
  hash_needed = false
  if url != secure_url
    user_agents.each do |user_agent|
      secure_details = begin
        curl_http_content_headers_and_checksum(
          secure_url,
          specs:             specs,
          hash_needed:       true,
          use_homebrew_curl: use_homebrew_curl,
          user_agent:        user_agent,
        )
      rescue Timeout::Error
        next
      end

      next unless http_status_ok?(secure_details[:status_code])

      # The HTTPS variant works: hash the HTTP body too and keep using the
      # user agent that succeeded.
      hash_needed = true
      user_agents = [user_agent]
      break
    end
  end

  details = nil
  user_agents.each do |user_agent|
    details =
      curl_http_content_headers_and_checksum(
        url,
        specs:             specs,
        hash_needed:       hash_needed,
        use_homebrew_curl: use_homebrew_curl,
        user_agent:        user_agent,
      )
    break if http_status_ok?(details[:status_code])
  end

  unless details[:status_code]
    # Hack around https://github.com/Homebrew/brew/issues/3199
    return if MacOS.version == :el_capitan

    return "The #{url_type} #{url} is not reachable"
  end

  unless http_status_ok?(details[:status_code])
    blocked_by_protection = details[:responses].any? do |response|
      url_protected_by_cloudflare?(response) || url_protected_by_incapsula?(response)
    end
    return if blocked_by_protection

    # https://github.com/Homebrew/brew/issues/13789
    # If the `:homepage` of a formula is private, it will fail an `audit`
    # since there's no way to specify a `strategy` with `using:` and
    # GitHub does not authorize access to the web UI using token
    #
    # Strategy:
    # If the `:homepage` 404s, it's a GitHub link, and we have a token then
    # check the API (which does use tokens) for the repository
    repo_details = url.match(%r{https?://github\.com/(?<user>[^/]+)/(?<repo>[^/]+)/?.*})
    check_github_api = url_type == SharedAudits::URL_TYPE_HOMEPAGE &&
                       details[:status_code] == "404" &&
                       repo_details &&
                       Homebrew::EnvConfig.github_api_token

    unless check_github_api
      return "The #{url_type} #{url} is not reachable (HTTP status code #{details[:status_code]})"
    end

    # The message has to be returned explicitly; as a bare expression in the
    # middle of the method it would be discarded and the failure lost.
    if SharedAudits.github_repo_data(repo_details[:user], repo_details[:repo]).nil?
      return "Unable to find homepage"
    end
  end

  if url.start_with?("https://") && Homebrew::EnvConfig.no_insecure_redirect? &&
     (details[:final_url].present? && !details[:final_url].start_with?("https://"))
    return "The #{url_type} #{url} redirects back to HTTP"
  end

  return unless secure_details

  return if !http_status_ok?(details[:status_code]) || !http_status_ok?(secure_details[:status_code])

  etag_match = details[:etag] &&
               details[:etag] == secure_details[:etag]
  content_length_match =
    details[:content_length] &&
    details[:content_length] == secure_details[:content_length]
  file_match = details[:file_hash] == secure_details[:file_hash]

  http_with_https_available =
    url.start_with?("http://") &&
    (secure_details[:final_url].present? && secure_details[:final_url].start_with?("https://"))

  if (etag_match || content_length_match || file_match) && http_with_https_available
    return "The #{url_type} #{url} should use HTTPS rather than HTTP"
  end

  return unless check_content

  no_protocol_file_contents = %r{https?:\\?/\\?/}
  http_content = details[:file]&.scrub&.gsub(no_protocol_file_contents, "/")
  https_content = secure_details[:file]&.scrub&.gsub(no_protocol_file_contents, "/")

  # Check for the same content after removing all protocols
  if (http_content && https_content) && (http_content == https_content) && http_with_https_available
    return "The #{url_type} #{url} should use HTTPS rather than HTTP"
  end

  return unless strict

  # A body is `nil` when it could not be read; there is nothing to compare.
  return if http_content.nil? || https_content.nil?

  # Same size, different content after normalization
  # (typical causes: Generated ID, Timestamp, Unix time)
  if http_content.length == https_content.length
    return "The #{url_type} #{url} may be able to use HTTPS rather than HTTP. Please verify it in a browser."
  end

  # An empty HTTP body cannot be similar in size to a non-empty HTTPS body;
  # returning here also avoids dividing by zero below.
  return if http_content.empty?

  lenratio = (100 * https_content.length / http_content.length).to_i
  return unless (90..110).cover?(lenratio)

  "The #{url_type} #{url} may be able to use HTTPS rather than HTTP. Please verify it in a browser."
end
# Requests `url` with `curl --location`, dumping the response headers to
# stdout and the body to a temporary file, and summarizes the outcome.
#
# @param url [String] The URL to request.
# @param specs [Hash] Extra `curl` options: each key becomes a `--long-option`
#   (underscores turned into dashes), a `true` value makes it a bare flag and
#   a `false` value drops it.
# @param hash_needed [true, false] Whether to compute a SHA-2 hex digest of
#   the body; also raises the time limits from 25 to 600 seconds.
# @param use_homebrew_curl [true, false] Passed through to `#curl_output`.
# @param user_agent [Symbol, String] Passed through to `#curl_output`.
# @return [Hash] A hash with `:url`, `:final_url`, `:status_code`, `:headers`,
#   `:etag`, `:content_length`, `:file`, `:file_hash` and `:responses`.
#   `:status_code` and `:headers` come from the last parsed response;
#   `:file` and `:file_hash` stay `nil` unless `curl` exited successfully.
def curl_http_content_headers_and_checksum(
url, specs: {}, hash_needed: false,
use_homebrew_curl: false, user_agent: :default
)
# Only the path is needed here; the file is closed right away and `curl`
# writes the body to that path via `--output`.
file = Tempfile.new.tap(&:close)
# Convert specs to options. This is mostly key-value options,
# unless the value is a boolean, in which case treat it as a flag.
specs = specs.flat_map do |option, argument|
next [] if argument == false # No flag.
args = ["--#{option.to_s.tr("_", "-")}"]
args << argument unless argument == true # It's a flag.
args
end
# Allow much more time when the whole body has to be hashed.
max_time = hash_needed ? 600 : 25
# `--dump-header -` sends the headers of every response to stdout, which is
# what gets parsed below.
output, _, status = curl_output(
*specs, "--dump-header", "-", "--output", file.path, "--location", url,
use_homebrew_curl: use_homebrew_curl,
connect_timeout: 15,
max_time: max_time,
retry_max_time: max_time,
user_agent: user_agent
)
parsed_output = parse_curl_output(output)
responses = parsed_output[:responses]
# The raw value of the last `location` header (not made absolute), if any.
final_url = curl_response_last_location(responses)
# `status_code` is only assigned when a last response exists; otherwise it
# is `nil` in the returned hash.
headers = if responses.last.present?
status_code = responses.last[:status_code]
responses.last[:headers]
else
{}
end
etag = headers["etag"][ETAG_VALUE_REGEX, 1] if headers["etag"].present?
content_length = headers["content-length"]
# The body is only read back when `curl` itself exited successfully.
if status.success?
open_args = {}
# Try to get encoding from Content-Type header
# TODO: add guessing encoding by <meta http-equiv="Content-Type" ...> tag
if (content_type = headers["content-type"]) &&
(match = content_type.match(/;\s*charset\s*=\s*([^\s]+)/)) &&
(charset = match[1])
begin
open_args[:encoding] = Encoding.find(charset)
rescue ArgumentError
# Unknown charset in Content-Type header
end
end
file_contents = File.read(file.path, **open_args)
file_hash = Digest::SHA2.hexdigest(file_contents) if hash_needed
end
{
url: url,
final_url: final_url,
status_code: status_code,
headers: headers,
etag: etag,
content_length: content_length,
file: file_contents,
file_hash: file_hash,
responses: responses,
}
ensure
# Always remove the temporary file, even when an error was raised above.
file.unlink
end
# Reports whether `curl` can complete a `--tlsv1.3 --head` request to
# https://brew.sh/. The result is cached per value of `#curl_path`.
#
# @return The (cached) result of `quiet_system` for that request.
def curl_supports_tls13?
  probe_results = (@curl_supports_tls13 ||= Hash.new do |cache, path|
    cache[path] = quiet_system(curl_executable, "--tlsv1.3", "--head", "https://brew.sh/")
  end)
  probe_results[curl_path]
end
# Tells whether an HTTP status code is informational (1xx) or successful (2xx).
#
# @param status The status code; it is converted with `to_i`, so strings and
#   `nil` are accepted.
# @return [true, false] Whether the code lies between 100 and 299 inclusive.
def http_status_ok?(status)
  status.to_i.between?(100, 299)
end
# Splits `curl` output into the HTTP responses it starts with and whatever
# follows them (the body content). Each response hash contains the
# `:status_code`, `:status_text`, and `:headers`.
#
# Responses are consumed from the front for as long as the remaining text
# starts with an HTTP status line and still contains a double `CRLF`.
#
# @param output [String] The output text from `curl` containing HTTP
#   responses, body content, or both.
# @param max_iterations [Integer] The maximum number of responses to parse.
#   This should correspond to the maximum number of requests in the output.
#   If `curl`'s `--max-redirs` option is used, `max_iterations` should be
#   `max-redirs + 1`, to account for any final response after the
#   redirections.
# @return [Hash] A hash containing an array of response hashes
#   (`:responses`) and the remaining body content (`:body`).
# @raise [RuntimeError] If more than `max_iterations` responses are found.
sig { params(output: String, max_iterations: Integer).returns(T::Hash[Symbol, T.untyped]) }
def parse_curl_output(output, max_iterations: 25)
  parsed_responses = []
  remaining = output.lstrip
  rounds = 0

  loop do
    starts_with_status_line = remaining.match?(%r{\AHTTP/[\d.]+ \d+})
    break unless starts_with_status_line && remaining.include?(HTTP_RESPONSE_BODY_SEPARATOR)

    rounds += 1
    raise "Too many redirects (max = #{max_iterations})" if rounds > max_iterations

    head, _separator, rest = remaining.partition(HTTP_RESPONSE_BODY_SEPARATOR)
    remaining = rest.lstrip
    next if head.blank?

    parsed = parse_curl_response(head.chomp)
    parsed_responses << parsed if parsed.present?
  end

  { responses: parsed_responses, body: remaining }
end
# Returns the URL from the last location header found in cURL responses,
# if any.
# @param responses [Array<Hash>] An array of hashes containing response
# status information and headers from `#parse_curl_response`.
# @param absolutize [true, false] Whether to make the location URL absolute.
# @param base_url [String, nil] The URL to use as a base for making the
# `location` URL absolute.
# @return [String, nil] The URL from the last-occurring `location` header
# in the responses or `nil` (if no `location` headers found).
sig {
params(
responses: T::Array[T::Hash[Symbol, T.untyped]],
absolutize: T::Boolean,
base_url: T.nilable(String),
).returns(T.nilable(String))
}
def curl_response_last_location(responses, absolutize: false, base_url: nil)
  # Walk backwards to find the latest response carrying a `location` header.
  redirecting = responses.reverse_each.find do |response|
    response[:headers].present? && response[:headers]["location"].present?
  end
  return if redirecting.nil?

  location = redirecting[:headers]["location"]
  # The raw header value is returned unless an absolute URL was requested
  # and there is a base to resolve it against.
  return location unless absolutize && base_url.present?

  URI.join(base_url, location).to_s
end
# Returns the final URL by following location headers in cURL responses.
# @param responses [Array<Hash>] An array of hashes containing response
# status information and headers from `#parse_curl_response`.
# @param base_url [String] The URL to use as a base.
# @return [String] The final absolute URL after redirections.
sig {
params(
responses: T::Array[T::Hash[Symbol, T.untyped]],
base_url: String,
).returns(String)
}
def curl_response_follow_redirections(responses, base_url)
  # Fold over the responses in order: every `location` header is resolved
  # against the URL reached so far, which then becomes the new base.
  responses.reduce(base_url) do |current_url, response|
    headers = response[:headers]
    next current_url if headers.blank?

    location = headers["location"]
    next current_url if location.blank?

    URI.join(current_url, location).to_s
  end
end
private
# Parses HTTP response text from `curl` output into a hash containing the
# information from the status line (status code and, optionally,
# descriptive text) and headers.
# @param response_text [String] The text of a `curl` response, consisting
# of a status line followed by header lines.
# @return [Hash] A hash containing the response status information and
# headers (as a hash with header names as keys).
sig { params(response_text: String).returns(T::Hash[Symbol, T.untyped]) }
def parse_curl_response(response_text)
  response = {}

  # One `match` both validates the status line and captures its parts.
  status_match = response_text.match(HTTP_STATUS_LINE_REGEX)
  return response unless status_match

  response[:status_code] = status_match["code"] if status_match["code"].present?
  response[:status_text] = status_match["text"] if status_match["text"].present?

  # Drop the status line (and the whitespace after it) so only header lines
  # remain.
  header_text = response_text.sub(%r{^HTTP/.* (\d+).*$\s*}, "")

  # Create a hash from the header lines
  headers = response[:headers] = {}
  header_text.split("\r\n").each do |line|
    header_name, header_value = line.split(/:\s*/, 2)
    next if header_name.blank?

    header_name = header_name.strip.downcase
    header_value&.strip!

    # A header seen once is stored as a string; repeated headers are
    # collected into an array in order of appearance.
    case (existing = headers[header_name])
    when nil
      headers[header_name] = header_value
    when String
      headers[header_name] = [existing, header_value]
    when Array
      existing.push(header_value)
    end
  end

  response
end
end
end
# FIXME: Include `Utils::Curl` explicitly everywhere it is used.
# Until then, this top-level `include` mixes the module into `Object`, which
# makes its methods (including the ones declared after `private`) callable
# without a receiver from anywhere.
include Utils::Curl # rubocop:disable Style/MixinUsage