Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 68 additions & 6 deletions tencentcloud/common/abstract_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@
from tencentcloud.common.profile.client_profile import ClientProfile, RegionBreakerProfile
from tencentcloud.common.sign import Sign
from tencentcloud.common.circuit_breaker import CircuitBreaker
from tencentcloud.common.domain_failover import (
DomainFailoverManager, _classify_exception, is_failover_triggered,
)
from tencentcloud.common.retry import NoopRetryer

warnings.filterwarnings("ignore", module="tencentcloud", category=UserWarning)
Expand Down Expand Up @@ -89,6 +92,10 @@ def __init__(self, credential, region, profile=None):
if self.profile.region_breaker_profile is None:
self.profile.region_breaker_profile = RegionBreakerProfile()
self.circuit_breaker = CircuitBreaker(self.profile.region_breaker_profile)

# 域名级容灾管理器(SDK 内部机制,对用户完全透明,.com → .com.cn → .cn)
self.domain_failover = DomainFailoverManager()

if self.profile.request_client:
self.request_client = self._sdkVersion + "; " + self.profile.request_client
else:
Expand Down Expand Up @@ -427,16 +434,71 @@ def _call(self, action, params, options=None, headers=None):
headers["X-TC-TraceId"] = str(uuid.uuid4())
if not self.profile.disable_region_breaker:
return self._call_with_region_breaker(action, params, options, headers)
req = RequestInternal(self._get_endpoint(options=options),
self.profile.httpProfile.reqMethod,
self._requestPath,
header=headers)
self._build_req_inter(action, params, req, options)

# apigw_endpoint 由用户显式指定,跳过域名切换
if self.profile.httpProfile.apigw_endpoint:
req = RequestInternal(self._get_endpoint(options=options),
self.profile.httpProfile.reqMethod,
self._requestPath,
header=headers)
self._build_req_inter(action, params, req, options)
req.host = self.profile.httpProfile.apigw_endpoint
req.header["Host"] = req.host
return self.request.send_request(req)
return self.request.send_request(req)

origin_endpoint = self._get_endpoint(options=options)
return self._call_with_domain_failover(origin_endpoint, action, params, options, headers)

def _call_with_domain_failover(self, origin_endpoint, action, params, options, headers):
    """Send the request to each usable failover candidate, one after another.

    Candidates are obtained from
    ``self.domain_failover.iter_available_candidates(origin_endpoint)`` as
    ``(host, breaker, generation)`` tuples and are tried in that order.

    * On success the candidate's breaker is told about the success and the
      response is returned immediately.
    * On a ``TencentCloudSDKException`` whose classified kind triggers
      failover, the failure is reported to the candidate's breaker and the
      next candidate is tried.
    * Any other ``TencentCloudSDKException`` is re-raised as-is and the
      breaker is not updated, so such errors do not count against the host.

    Raises:
        TencentCloudSDKException: the last failover-triggering error when
            every candidate failed, or a ``ClientNetworkError`` if no
            candidate recorded an error at all.
    """
    final_error = None

    for host, breaker, generation in self.domain_failover.iter_available_candidates(origin_endpoint):
        # The request is rebuilt for every candidate because the host differs
        # (the original author notes the TC3 signature includes `host:`).
        # A shallow copy of the caller's header dict keeps headers written
        # while building one attempt from leaking into the next attempt.
        attempt_headers = dict(headers)
        request = RequestInternal(
            host,
            self.profile.httpProfile.reqMethod,
            self._requestPath,
            header=attempt_headers,
        )
        self._build_req_inter(action, params, request, options)
        # Force the Host header in case the signing path did not set it.
        request.header["Host"] = host

        # Point the connection's request_host at the candidate for the
        # duration of this attempt only; it is always restored afterwards.
        saved_host = self.request.conn.request_host
        self.request.conn.request_host = host
        try:
            response = self.request.send_request(request)
            breaker.after_requests(generation, True)
            return response
        except TencentCloudSDKException as err:
            kind = _classify_exception(err)
            if not is_failover_triggered(kind):
                # Not a failover-triggering error: propagate unchanged and
                # leave the breaker counters alone.
                raise
            # Failover-triggering error: record it and fall through to the
            # next candidate.
            breaker.after_requests(generation, False)
            final_error = err
            logger.debug(
                "domain_failover: candidate=%s kind=%s err=%s, try next",
                host, kind, err)
        finally:
            self.request.conn.request_host = saved_host

    if final_error is None:
        # Only reachable when no candidate recorded a failover error.
        raise TencentCloudSDKException("ClientNetworkError", "all failover candidates failed")
    # Every candidate failed: surface the most recent error.
    raise final_error

def call(self, action, params, options=None, headers=None):

Expand Down
229 changes: 229 additions & 0 deletions tencentcloud/common/domain_failover.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
# -*- coding: utf-8 -*-
#
# Copyright 2017-2026 Tencent Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
"""
域名级容灾切换模块。

当 SDK 发起的请求命中 DNS/TCP/TLS 类故障(详见 `tests/dns_failure_test/
DNS_FAILURE_SDK_EXCEPTION_ANALYSIS.md`)时,本模块按"主域名 → .com.cn →
.cn"的顺序串行重试,并为每个候选域名维护一个独立的 CircuitBreaker。

规则:
- *.tencentcloudapi.com -> *.tencentcloudapi.com.cn -> *.tencentcloudapi.cn
- *.{region}.tencentcloudapi.com -> *.{region}.tencentcloudapi.com.cn -> *.{region}.tencentcloudapi.cn
- *.internal.tencentcloudapi.com -> 按通用规则切换
- *.intl.tencentcloudapi.com -> 不切换(国际站)
"""
import json
import logging
import socket
import threading

try:
import ssl as _ssl
except ImportError: # pragma: no cover
_ssl = None

from tencentcloud.common.circuit_breaker import CircuitBreaker
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException

logger = logging.getLogger("tencentcloud_sdk_common")

# Primary domain root -> backup candidate roots, listed in priority order.
_FAILOVER_SUFFIX_RULES = [
("tencentcloudapi.com", ["tencentcloudapi.com.cn", "tencentcloudapi.cn"]),
]

# International-site domain suffix: hosts matching it are never switched.
_INTL_SUFFIX = ".intl.tencentcloudapi.com"


class _InternalBreakerSetting(object):
    """Fixed circuit-breaker thresholds for domain failover.

    These values are internal constants and are not user-configurable.
    According to the original author's notes, the attribute names mirror
    RegionBreakerProfile so the existing CircuitBreaker can consume this
    object, and each candidate host's breaker gets its own instance.
    """

    def __init__(self):
        seconds_per_minute = 60

        # Failure thresholds (absolute count and ratio).
        self.max_fail_num = 5
        self.max_fail_percent = 0.75

        # Accumulation window: five minutes, expressed in seconds.
        self.window_interval = 5 * seconds_per_minute

        # Per the original notes: seconds spent OPEN before moving to
        # HALF_OPEN.
        self.timeout = seconds_per_minute

        # Per the original notes: successes needed in HALF_OPEN before the
        # breaker returns to CLOSED.
        self.max_requests = 5


def _classify_exception(exc):
    """Classify an exception into a domain-failover "kind" string.

    For a ``TencentCloudSDKException`` the direct cause (``__cause__`` or,
    failing that, ``__context__``) is inspected; any other exception is
    inspected itself. The chain is also followed to its last link, and both
    the direct exception and that last link take part in the checks below.

    Returns:
        * ``"JSON_DECODE_ERROR"`` - SDK exception whose code is
          ``"JSONDecodeError"``.
        * ``"TLS_ERROR"`` - requests ``SSLError`` or ``ssl.SSLError``.
        * ``"TCP_READ_TIMEOUT"`` - requests ``ReadTimeout``,
          ``socket.timeout``, or requests ``ConnectTimeout`` (connect
          timeouts share this label).
        * ``"TCP_CONN_REFUSED"`` - ``ConnectionRefusedError``.
        * ``"DNS_TIMEOUT"`` / ``"DNS_NXDOMAIN"`` - ``socket.gaierror``,
          split on its errno; any other requests ``ConnectionError`` is also
          reported as ``"DNS_NXDOMAIN"`` as a catch-all.
        * ``None`` - nothing matched, or there is no underlying exception.
    """
    is_sdk_error = isinstance(exc, TencentCloudSDKException)

    # JSON decoding failures are wrapped by the SDK with a dedicated code.
    if is_sdk_error and exc.get_code() == "JSONDecodeError":
        return "JSON_DECODE_ERROR"

    # Pick the exception the checks start from.
    direct = (exc.__cause__ or exc.__context__) if is_sdk_error else exc
    if direct is None:
        return None

    def _next_link(err):
        # Explicit cause wins over implicit context.
        return getattr(err, "__cause__", None) or getattr(err, "__context__", None)

    # Follow the chain to its last link, stopping early on a cycle.
    tail = direct
    visited = set()
    following = _next_link(tail)
    while following is not None and id(following) not in visited:
        visited.add(id(tail))
        tail = following
        following = _next_link(tail)

    # requests is imported lazily; when it is unavailable the empty tuples
    # make every requests-based check below evaluate to false.
    try:
        import requests
    except ImportError:  # pragma: no cover
        conn_error_type = read_timeout_type = connect_timeout_type = ssl_error_type = ()
    else:
        conn_error_type = requests.exceptions.ConnectionError
        read_timeout_type = requests.exceptions.ReadTimeout
        connect_timeout_type = requests.exceptions.ConnectTimeout
        ssl_error_type = requests.exceptions.SSLError

    # TLS failures.
    if ((ssl_error_type and isinstance(direct, ssl_error_type))
            or (_ssl is not None and isinstance(tail, _ssl.SSLError))):
        return "TLS_ERROR"

    # Read timeouts, then connect timeouts (same label).
    if ((read_timeout_type and isinstance(direct, read_timeout_type))
            or isinstance(tail, socket.timeout)
            or (connect_timeout_type and isinstance(direct, connect_timeout_type))):
        return "TCP_READ_TIMEOUT"

    # Connection refused.
    if isinstance(tail, ConnectionRefusedError):
        return "TCP_CONN_REFUSED"

    # DNS resolution failures.
    if isinstance(tail, socket.gaierror):
        code = getattr(tail, "errno", None)
        # EAI_AGAIN is -3 on glibc and 11002 on Windows per the original
        # note; it is treated as a DNS timeout.
        if code in (socket.EAI_AGAIN, -3, 11002):
            return "DNS_TIMEOUT"
        return "DNS_NXDOMAIN"

    # Catch-all: any remaining requests ConnectionError still triggers
    # failover.
    if conn_error_type and isinstance(direct, conn_error_type):
        return "DNS_NXDOMAIN"

    return None


def is_failover_triggered(kind):
    """Return True when *kind* should cause a switch to the next domain.

    DNS, TCP and TLS kinds trigger failover; ``"JSON_DECODE_ERROR"``,
    ``None`` and anything else do not.
    """
    triggering_kinds = (
        "DNS_NXDOMAIN",
        "DNS_TIMEOUT",
        "TCP_CONN_REFUSED",
        "TCP_READ_TIMEOUT",
        "TLS_ERROR",
    )
    return kind in triggering_kinds


def _split_host_suffix(host):
    """Split *host* into ``(prefix, matched_suffix)`` using the known rules.

    The suffixes tried are the first elements of ``_FAILOVER_SUFFIX_RULES``.
    A host matches when it equals the suffix or ends with ``"." + suffix``;
    the returned prefix keeps its trailing dot (or is empty on an exact
    match). Returns ``(None, None)`` for an empty host or when no suffix
    matches.
    """
    if host:
        for known_suffix, _alternates in _FAILOVER_SUFFIX_RULES:
            exact = host == known_suffix
            if exact or host.endswith("." + known_suffix):
                return host[:-len(known_suffix)], known_suffix
    return None, None


def build_candidates(host):
    """Build the ordered list of candidate hosts for *host*.

    The first element is always *host* itself. Only ``[host]`` is returned
    when *host* is empty, matches the international-site suffix
    ``_INTL_SUFFIX``, or matches none of the suffixes in
    ``_FAILOVER_SUFFIX_RULES``. Otherwise each backup suffix configured for
    the matched rule is appended, in rule order, with the original prefix
    kept.
    """
    if not host:
        return [host]

    # International-site hosts are never switched.
    intl_bare = _INTL_SUFFIX.lstrip(".")
    if host == intl_bare or host.endswith(_INTL_SUFFIX):
        return [host]

    prefix, suffix = _split_host_suffix(host)
    if suffix is None:
        # Unsupported host (e.g. a custom endpoint): nothing to switch to.
        return [host]

    alternates = dict(_FAILOVER_SUFFIX_RULES)[suffix]
    return [host] + [prefix + alternate for alternate in alternates]


class DomainFailoverManager(object):
    """Container holding one circuit breaker per candidate host.

    Breakers are created lazily, the first time a host is requested, and are
    kept in a dict guarded by a lock. The manager exposes no switches or
    thresholds. For hosts that yield a single candidate (see
    ``build_candidates``) there is nothing to switch to, so only that host
    is ever returned.
    """

    def __init__(self):
        self._lock = threading.Lock()
        self._breakers = {}

    def get_breaker(self, host):
        """Return the breaker for *host*, creating it on first use."""
        with self._lock:
            existing = self._breakers.get(host)
            if existing is not None:
                return existing
            created = CircuitBreaker(_InternalBreakerSetting())
            self._breakers[host] = created
            return created

    def iter_available_candidates(self, host):
        """Return a list of ``(candidate_host, breaker, generation)`` tuples.

        Candidates whose breaker asks to be skipped are left out. If that
        leaves nothing, the first (primary) candidate is returned anyway so
        a request is still attempted. The caller must report the outcome via
        ``breaker.after_requests(generation, success)``.
        """
        hosts = build_candidates(host)
        selected = []
        for name in hosts:
            breaker = self.get_breaker(name)
            generation, must_skip = breaker.before_requests()
            if not must_skip:
                selected.append((name, breaker, generation))
            else:
                logger.debug("domain_failover: skip %s (breaker open)", name)

        if selected:
            return selected

        # Every breaker asked to skip: still give the primary host a chance.
        primary = hosts[0]
        breaker = self.get_breaker(primary)
        generation, _ = breaker.before_requests()
        return [(primary, breaker, generation)]
11 changes: 10 additions & 1 deletion tencentcloud/common/http/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,17 @@ def send_request(self, req_inter):
http_resp = self._request(req_inter)
self.request_size = self.conn.request_length
return http_resp
except TencentCloudSDKException:
# 已经是 SDK 异常(例如 _request 里抛出的 ClientParamsError),原样抛出,避免二次包装
raise
except Exception as e:
raise TencentCloudSDKException("ClientNetworkError", str(e))
# 保留原有的 ClientNetworkError 外壳以兼容重试机制(StandardRetryer 依赖该错误码),
# 同时使用 PEP 3134 标准的 `raise ... from e` 建立异常链,上层可通过
# e.__cause__ 直接拿到原始异常(如 requests.exceptions.ConnectionError /
# ReadTimeout / SSLError 等),并沿 __cause__ 继续追到末端 socket.gaierror、
# ConnectionRefusedError、socket.timeout、CertificateError 等,从而对
# DNS/网络故障做精细化容灾判断。
raise TencentCloudSDKException("ClientNetworkError", str(e)) from e


class RequestInternal(object):
Expand Down
1 change: 0 additions & 1 deletion tencentcloud/common/profile/client_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ def __init__(self, signMethod=None, httpProfile=None, language="zh-CN",

self.retryer = retryer


class RegionBreakerProfile(object):
"""RegionBreaker profile.

Expand Down
Loading