diff --git a/src/components/HashRedirect.jsx b/src/components/HashRedirect.jsx new file mode 100644 index 00000000..4d76b97c --- /dev/null +++ b/src/components/HashRedirect.jsx @@ -0,0 +1,24 @@ +import { useEffect } from 'react' +import { useRouter } from 'next/router' + +/** + * Client-side redirect for fragment links to sections that have moved to other + * pages. `next.config` redirects can't act on the URL hash (it never reaches the + * server), so this catches old deep links like + * `/selfhosted/troubleshooting#debugging-turn-connections` on mount and forwards + * them to the new location. + * + * @param {Record<string, string>} map - old anchor id -> new path (optionally with #anchor); only the map's own keys match, so hashes like `#constructor` never hit Object.prototype + */ +export function HashRedirect({ map = {} }) { + const router = useRouter() + useEffect(() => { + const id = window.location.hash.replace(/^#/, '') + if (id && Object.prototype.hasOwnProperty.call(map, id) && map[id]) { + router.replace(map[id]) + } + // run once on mount + // eslint-disable-next-line react-hooks/exhaustive-deps + }, []) + return null +} diff --git a/src/components/NavigationDocs.jsx b/src/components/NavigationDocs.jsx index b9d94f50..11e5d3a4 100644 --- a/src/components/NavigationDocs.jsx +++ b/src/components/NavigationDocs.jsx @@ -736,7 +736,28 @@ export const docsNavigation = [ ], }, { title: 'Advanced Guide', href: '/selfhosted/selfhosted-guide' }, - { title: 'Troubleshooting', href: '/selfhosted/troubleshooting' }, + { + title: 'Troubleshooting', + isOpen: false, + links: [ + { title: 'Overview', href: '/selfhosted/troubleshooting' }, + { title: 'Installation', href: '/selfhosted/troubleshooting/installation' }, + { + title: 'Embedded IdP', + href: '/selfhosted/troubleshooting/identity-provider', + }, + { title: 'Dashboard', href: '/selfhosted/troubleshooting/dashboard' }, + { + title: 'Certificates', + href: '/selfhosted/troubleshooting/certificates', + }, + { + title: 'Connectivity', + href: '/selfhosted/troubleshooting/connectivity', + }, + { title: 'Database', href: '/selfhosted/troubleshooting/database' }, + ], 
+ }, { title: 'Migration Guides', isOpen: false, @@ -856,14 +877,117 @@ export const docsNavigation = [ title: 'GET MORE HELP', links: [ { - title: 'Troubleshooting client issues', - href: '/help/troubleshooting-client', - }, - { - title: 'Troubleshooting relayed connections', - href: '/help/troubleshooting-relayed-connections', + title: 'Troubleshooting', + isOpen: false, + links: [ + { title: 'Overview', href: '/help/troubleshooting' }, + { + title: 'NetBird Client', + isOpen: false, + links: [ + { title: 'Overview', href: '/help/troubleshooting-client' }, + { title: 'Linux', href: '/help/troubleshooting-client/linux' }, + { title: 'Windows', href: '/help/troubleshooting-client/windows' }, + { title: 'macOS', href: '/help/troubleshooting-client/macos' }, + { title: 'Android', href: '/help/troubleshooting-client/android' }, + { title: 'iOS', href: '/help/troubleshooting-client/ios' }, + ], + }, + { + title: 'Self-hosted', + isOpen: false, + links: [ + { title: 'Overview', href: '/selfhosted/troubleshooting' }, + { + title: 'Installation', + href: '/selfhosted/troubleshooting/installation', + }, + { + title: 'Embedded IdP', + href: '/selfhosted/troubleshooting/identity-provider', + }, + { + title: 'Dashboard', + href: '/selfhosted/troubleshooting/dashboard', + }, + { + title: 'Certificates', + href: '/selfhosted/troubleshooting/certificates', + }, + { + title: 'Connectivity', + href: '/selfhosted/troubleshooting/connectivity', + }, + { + title: 'Database', + href: '/selfhosted/troubleshooting/database', + }, + ], + }, + { + title: 'Cloud & identity', + isOpen: false, + links: [ + { + title: 'Pending approval', + href: '/help/troubleshooting-account-access', + }, + { title: 'IdP & SSO setup', href: '/manage/team/single-sign-on' }, + { title: 'User provisioning', href: '/manage/team/idp-sync' }, + { + title: 'Plan limits & quotas', + href: '/manage/settings/plans-and-billing', + }, + ], + }, + { + title: 'Connectivity', + isOpen: false, + links: [ + { + title: 
'Relayed Connections', + href: '/help/troubleshooting-relayed-connections', + }, + { + title: 'Resource Connectivity', + href: '/help/troubleshooting-resource-connectivity', + }, + { + title: 'NAT & Connectivity', + href: '/about-netbird/understanding-nat-and-connectivity', + }, + { + title: 'Ports & Firewalls', + href: '/about-netbird/ports-and-firewalls', + }, + ], + }, + { + title: 'Access control', + isOpen: false, + links: [ + { title: 'Policies', href: '/manage/access-control' }, + { + title: 'Posture checks', + href: '/manage/access-control/posture-checks', + }, + { + title: 'Groups', + href: '/manage/access-control#understanding-groups', + }, + ], + }, + { + title: 'Report a bug', + isOpen: false, + links: [ + { title: 'Overview', href: '/help/report-bug-issues' }, + { title: 'Community Support', href: '/help/community-support' }, + { title: 'NetBird Support', href: '/help/netbird-support' }, + ], + }, + ], }, - { title: 'Report bugs and issues', href: '/help/report-bug-issues' }, { title: 'Support Matrix', isOpen: false, diff --git a/src/components/StillStuck.jsx b/src/components/StillStuck.jsx new file mode 100644 index 00000000..5eb9b145 --- /dev/null +++ b/src/components/StillStuck.jsx @@ -0,0 +1,49 @@ +import { Fragment } from 'react' +import { Button } from '@/components/Button' + +/** + * "Still stuck?" call-to-action banner for the bottom of the Troubleshooting + * hub. Renders a short prompt and a row of action buttons. + * + * @param {string} [title='Still stuck?'] + * @param {string} description + * @param {Array<{label: string, href: string, primary?: boolean}>} actions + * @param {string} [separator] - Optional word rendered between buttons (e.g. "or"), + * so two equally-weighted actions don't read as a recommended-vs-secondary pair. + */ +export function StillStuck({ + title = 'Still stuck?', + description, + actions = [], + separator, +}) { + return ( +
+

+ {title} +

+ {description && ( +

+ {description} +

+ )} +
+ {actions.map((action, i) => ( + + {i > 0 && separator && ( + + {separator} + + )} + + + ))} +
+
+ ) +} diff --git a/src/components/SupportBanner.jsx b/src/components/SupportBanner.jsx new file mode 100644 index 00000000..410128f1 --- /dev/null +++ b/src/components/SupportBanner.jsx @@ -0,0 +1,78 @@ +/** + * Audience banner for the "Report bugs and issues" page. Two tones: + * - "community": neutral, for everyone outside the managed Cloud. + * - "support": orange/netbird accent, for paying NetBird Cloud customers. + * + * @param {'community'|'support'} [tone='community'] + * @param {string} [id] - Optional id so the banner can be a scroll target + * @param {string} title + * @param {string} [badge] - Small pill (e.g. "Free", "Cloud customers") + * @param {string} description + * @param {Array<{label: string, href: string}>} [links] - Bullet links to the + * section's channels / anchors. + */ +export function SupportBanner({ + tone = 'community', + id, + title, + badge, + description, + links = [], +}) { + const isSupport = tone === 'support' + const accent = isSupport + ? 'border-[#F28C28]' + : 'border-zinc-300 dark:border-zinc-600' + const badgeClass = isSupport + ? 'bg-[#F28C28]/10 text-[#C2410C] ring-[#F28C28]/30 dark:text-[#FFAC1C] dark:ring-[#F28C28]/40' + : 'bg-zinc-900/5 text-zinc-600 ring-zinc-900/10 dark:bg-white/5 dark:text-zinc-400 dark:ring-white/10' + + return ( +
+ {(title || badge) && ( +
+ {title && ( +

+ {title} +

+ )} + {badge && ( + + {badge} + + )} +
+ )} + {description && ( +

+ {description} +

+ )} + {links.length > 0 && ( + + )} +
+ ) +} diff --git a/src/components/TroubleshootingStart.jsx b/src/components/TroubleshootingStart.jsx new file mode 100644 index 00000000..ef0f6168 --- /dev/null +++ b/src/components/TroubleshootingStart.jsx @@ -0,0 +1,96 @@ +import Link from 'next/link' + +/** + * "Start here" diagnostics block: a highlighted panel with a short intro and a + * row of numbered step cards. Each step can link to a section (href) and/or + * show the relevant command. + * + * @param {string} [eyebrow='Start here'] - Small uppercase label + * @param {string} title - Panel title (e.g. "Collect diagnostics first") + * @param {string} [id] - Optional id for the title anchor + * @param {string} [description] - Intro text below the title + * @param {Array<{label: string, title: string, command?: string, href?: string, hint?: string}>} steps + */ +export function TroubleshootingStart({ + eyebrow = 'Start here', + title, + id, + description, + steps = [], +}) { + return ( +
+
+

+ {eyebrow} +

+ {title && ( +

+ {title} +

+ )} + {description && ( +

+ {description} +

+ )} + + {steps.length > 0 && ( +
+ {steps.map((step) => { + const content = ( + <> +

+ {step.label} +

+

+ {step.title} + {step.href && ( + + )} +

+ {step.command && ( + + {step.command} + + )} + {step.hint && ( +

+ {step.hint} +

+ )} + + ) + + const base = + 'block rounded-xl bg-white p-4 ring-1 ring-inset ring-zinc-900/7.5 dark:bg-white/5 dark:ring-white/10' + + return step.href ? ( + + {content} + + ) : ( +
+ {content} +
+ ) + })} +
+ )} +
+
+ ) +} diff --git a/src/components/TroubleshootingTiles.jsx b/src/components/TroubleshootingTiles.jsx new file mode 100644 index 00000000..8faa345e --- /dev/null +++ b/src/components/TroubleshootingTiles.jsx @@ -0,0 +1,235 @@ +import Link from 'next/link' +import { motion, useMotionTemplate, useMotionValue } from 'framer-motion' + +import { GridPattern } from '@/components/GridPattern' +import { Heading } from '@/components/Heading' + +// Monochrome inline SVG icons. They use `currentColor` so they inherit the +// card's text color and work in both light and dark mode. These are recreated +// here on purpose — we do not reference the bundled asset IDs from the design +// mock, which would not resolve in this project. +const icons = { + laptop: ( + <> + + + + ), + cloud: , + server: ( + <> + + + + + ), + firewall: ( + <> + + + + ), + gear: ( + <> + + + + ), + globe: ( + <> + + + + ), + shield: , + terminal: ( + <> + + + + ), + windows: ( + <> + + + + + + ), + android: ( + <> + + + + + ), + mobile: ( + <> + + + + ), + lock: ( + <> + + + + ), + database: ( + <> + + + + + ), +} + +function TileIcon({ name }) { + return ( + + ) +} + +function TilePattern({ mouseX, mouseY }) { + let maskImage = useMotionTemplate`radial-gradient(180px at ${mouseX}px ${mouseY}px, white, transparent)` + let style = { maskImage, WebkitMaskImage: maskImage } + + return ( +
+
+ +
+ +
+ ) +} + +function TroubleshootingCard({ item }) { + let mouseX = useMotionValue(0) + let mouseY = useMotionValue(0) + + function onMouseMove({ currentTarget, clientX, clientY }) { + let { left, top } = currentTarget.getBoundingClientRect() + mouseX.set(clientX - left) + mouseY.set(clientY - top) + } + + return ( + // Plain container
— NOT an <a>. The title link and each chip are the + // only interactive elements, which keeps the markup free of nested + // interactive (link-inside-link) HTML. A card without a dedicated landing + // page (e.g. NetBird Cloud) simply renders its title as plain text; its + // chips still link to the relevant docs. +
+ +
+
+
+ + + +

+ {item.href ? ( + + {item.title} + + ) : ( + item.title + )} +

+
+ + {item.description && ( +

+ {item.description} +

+ )} + + {item.chips && item.chips.length > 0 && ( +
+ {item.chips.map((chip) => ( + + {chip.label} + + ))} +
+ )} +
+
+ ) +} + +/** + * Troubleshooting hub grid: richer icon + chip cards. + * + * @param {string} [title] - Section heading (e.g. "Find your issue by area") + * @param {string} [id] - Optional id for the heading anchor + * @param {string} [description] - Optional description below the title + * @param {Array<{ + * title: string, + * href?: string, // title links here; omit for a "coming soon" card + * icon: string, // key into the inline icon set + * description?: string, + * chips?: Array<{label: string, href: string}> + * }>} [items=[]] + */ +export function TroubleshootingTiles({ title, id, description, items = [] }) { + const hasHeader = title || description + + return ( +
+ {title && ( + + {title} + + )} + {description && ( +
+ {description} +
+ )} +
+ {items.map((item) => ( + + ))} +
+
+ ) +} diff --git a/src/pages/help/community-support.mdx b/src/pages/help/community-support.mdx new file mode 100644 index 00000000..ca1a2985 --- /dev/null +++ b/src/pages/help/community-support.mdx @@ -0,0 +1,28 @@ +import { SupportBanner } from "@/components/SupportBanner" + +export const description = + "Community Support for NetBird: Slack and GitHub Discussions for the client, open source self-hosted, and general questions." + +# Community Support + + + +Whichever channel you choose, include a [debug bundle](/help/troubleshooting-client#debug-bundle), your NetBird version (`netbird version`), and clear steps to reproduce. + +## Community Slack + +Best for quick questions or general configuration help. Join the [NetBird community Slack](/slack-url) to talk with other users and the team, and share debug output in a thread. + +## GitHub Discussions + +Best for bug reports, feature requests, and anything worth a searchable, written record. Open a thread in [GitHub Discussions](https://github.com/netbirdio/netbird/discussions) and include your version, environment, and a debug bundle. + +For Cloud dashboard, billing, or commercial-license issues, use [NetBird Support](/help/netbird-support) instead. diff --git a/src/pages/help/netbird-support.mdx b/src/pages/help/netbird-support.mdx new file mode 100644 index 00000000..9320ad89 --- /dev/null +++ b/src/pages/help/netbird-support.mdx @@ -0,0 +1,51 @@ +import { SupportBanner } from "@/components/SupportBanner" + +export const description = + "NetBird Support for (paying) Cloud customers and users, and commercial-license self-hosted deployments: reach the team with a pre-filled report." + +export const reportTemplate = `NetBird issue report +===================== + +1) Describe the problem + A clear, concise description of what is going wrong. + +2) Steps to reproduce + 1. + 2. + 3. + +3) Expected behavior + What you expected to happen instead. 
+ +4) Environment + - NetBird Cloud, or self-hosted control plane: + - NetBird version (run: netbird version): + - Other VPN software installed (if any): + +5) Debug output + - Anonymized status (run: netbird status -d): + - Debug bundle, share the returned file key (run: netbird debug for 1m -S -U): + Uploaded files are auto-deleted after 30 days. + +6) Additional context + Screenshots, logs, or anything else relevant.` + +export const supportMailto = `mailto:support@netbird.io?subject=${encodeURIComponent( + "NetBird issue report" +)}&body=${encodeURIComponent(reportTemplate)}` + +# NetBird Support + + + +Reach the team by opening a
pre-filled support email. It drops a ready-to-fill report into your mail client, with the fields we need already laid out, so you only fill in the blanks. Prefer to write it yourself? Email [support@netbird.io](mailto:support@netbird.io) and include a [debug bundle](/help/troubleshooting-client#debug-bundle), your NetBird version, and clear steps to reproduce. + +Not a Cloud or commercial-license customer? [Community Support](/help/community-support), through Slack and GitHub Discussions, is the right place. diff --git a/src/pages/help/report-bug-issues.mdx b/src/pages/help/report-bug-issues.mdx index 7534bc66..c687d9fe 100644 --- a/src/pages/help/report-bug-issues.mdx +++ b/src/pages/help/report-bug-issues.mdx @@ -1,67 +1,25 @@ -# Report bugs and issues -NetBird offers different ways to report bugs and issues. For prompt and effective assistance, please provide detailed information as outlined in our bug/issue [reporting template](#reporting-template). - -For cloud users, you can report bugs and issues via email by sending an email to [support@netbird.io](mailto:support@netbird.io), via [Github issues](https://github.com/netbirdio/netbird/issues/new/choose) or by joining our [Slack Channel](/slack-url). - -For on-premise users, you can report bugs and issues via [Github issues](https://github.com/netbirdio/netbird/issues/new/choose) or by joining our [Slack Channel](/slack-url). - -## Reporting Template -When reporting bugs and issues, please ensure you provide the following information: - - -**Describe the problem** - -A clear and concise description of what the problem is. - -**To Reproduce** - -Steps to reproduce the behavior: -1. Go to '...' -2. Click on '....' -3. Scroll down to '....' -4. See error - -**Have you performed any debugging steps?** -Learn more at [troubleshooting guide](/help/troubleshooting-client) - -**Expected behavior** - -A clear and concise description of what you expected to happen. 
- -**Are you using NetBird Cloud?** - -Please specify whether you use NetBird Cloud or self-host NetBird's control plane. - -**NetBird version** +import { Tiles } from "@/components/Tiles" -`netbird version` +export const description = + "How to report NetBird bugs and issues: Community Support for the client and open source self-hosted, and NetBird Support for Cloud customers and users, and commercial-license deployments." -**Is any other VPN software installed?** - -If yes, which one? - -**Debug output** - -To help us resolve the problem, please attach the following anonymized status output - - netbird status -d - -Create and upload a debug bundle, and share the returned file key: - - netbird debug for 1m -S -U - -*Uploaded files are automatically deleted after 30 days.* - - -Alternatively, create the file only and attach it here manually: - - netbird debug for 1m -S - - -**Screenshots** - -If applicable, add screenshots to help explain your problem. - -**Additional context** +# Report bugs and issues -Add any other context about the problem here. +NetBird offers two ways to get help, depending on what you are running. Pick the one that fits. Whichever you use, include a [debug bundle](/help/troubleshooting-client#debug-bundle), your NetBird version (`netbird version`), and clear steps to reproduce. + + diff --git a/src/pages/help/troubleshooting-account-access.mdx b/src/pages/help/troubleshooting-account-access.mdx new file mode 100644 index 00000000..33cd5fa4 --- /dev/null +++ b/src/pages/help/troubleshooting-account-access.mdx @@ -0,0 +1,25 @@ +import {Note} from "@/components/mdx"; + +export const description = "Why your NetBird user might be pending approval, or added to an account you don't recognize, and how to resolve it safely." + +# Pending approval and account invitations + +Signed in and found your user **pending approval**, or landed in a NetBird account you don't recognize? This usually is not a bug. 
It almost always means you were added to an existing organization automatically, and an admin still has to approve you. + +## Why this happens + +NetBird can add people to an account without a manual, one-by-one invite: + +- **Indirect (domain) invites.** If an account has verified your email domain, new sign-ups with that domain join it automatically. See [Indirect user invites](/manage/team/add-users-to-your-network#indirect-user-invites). +- **Identity provider sync.** If your organization connects NetBird to its identity provider, being added to a synced group in the IdP provisions your user in NetBird. See [Provision users and groups from your identity provider](/manage/team/idp-sync). +- **Approval is required.** When an account turns on [user approval](/manage/team/approve-users#require-user-approval), new users stay blocked until an admin approves them. That block is the "pending approval" state you see. + +## What to do + +1. **Confirm it is your organization.** Check with your IT or security team whether they run NetBird and expect you on it. The account [Owner](/manage/team/user-roles#owner), of which there is exactly one per account, or an admin can confirm and approve you. +2. **Ask an admin to approve your user.** Approval happens in the dashboard from the account's user list. See [Approve or reject user](/manage/team/approve-users#approve-or-reject-user). +3. **If something looks off,** for example you don't recognize the account, your domain was claimed unexpectedly, or no one internally owns it, do not approve or accept anything. Treat it as suspicious and verify internally first. + + +Still unsure who owns the account, or think you were added to the wrong one? Reach out through [NetBird Support](/help/netbird-support) and the team can help confirm the account and its owner. 
+ diff --git a/src/pages/help/troubleshooting-client.mdx b/src/pages/help/troubleshooting-client.mdx index 3f5b2b92..8cc09726 100644 --- a/src/pages/help/troubleshooting-client.mdx +++ b/src/pages/help/troubleshooting-client.mdx @@ -1,12 +1,86 @@ -# Troubleshooting client issues +import { TroubleshootingStart } from "@/components/TroubleshootingStart" +import { TroubleshootingTiles } from "@/components/TroubleshootingTiles" +import { HashRedirect } from "@/components/HashRedirect" -This document offers practical tips and insights to help you debug various problems, ensuring a seamless user -experience. + -## NetBird agent status +# Troubleshooting client issues -The netbird agent is a daemon service that runs in the background; it provides information about peers connected and -about the NetBird control services. You can check the status of the agent with the following command: +This document offers practical tips and insights to help you debug various problems, ensuring a seamless user +experience. The steps here are cross-platform; for OS-specific details, use the platform guides below. + + + + + +## NetBird client status + +The NetBird client is a daemon service that runs in the background; it provides information about peers connected and +about the NetBird control services. You can check the status of the client with the following command: ```shell netbird status --detail @@ -108,7 +182,7 @@ log rotation conflict detected in: "/etc/logrotate.d/netbird", rotation is disab To use `logrotate`, we require `copytruncate` and not `create` to be set in the config file, otherwise the daemon needs to be restarted for the new log file to be opened after rotation. -Netbird supports `compress`, `delaycompress` and `nocompress`. +NetBird supports `compress`, `delaycompress` and `nocompress`. On macOS and BSD systems, the equivalent of `logrotate` is `newsyslog`, configured via `/etc/newsyslog.conf` or files in `/etc/newsyslog.d/`. 
NetBird does not auto-detect `newsyslog` configurations, so if you want `newsyslog` to manage `client.log` you must set [`NB_LOG_DISABLE_ROTATION=true`](/client/environment-variables#logging) on the daemon to disable the built-in rotation. An example `/etc/newsyslog.d/netbird.conf` entry: @@ -152,6 +226,22 @@ This will output the path of the generated file. The output file is owned by and NetBird is running as, by default it is: `Administrator` on Windows, `root` on MacOS/Linux or the operating system\'s equivalent. +#### What's inside the bundle + +The archive collects the most useful diagnostics into one file, and every bundle ships a `README.txt` that documents each entry. The ones you will reach for most often: + +| File | What it holds | +|---|---| +| `status.txt` | Status output, the same view as `netbird status -d` | +| `client.log`, `netbird.err`, `netbird.out` | Recent client logs, plus stderr and stdout | +| `routes.txt`, `ip_rules.txt` | System routing table and IP rules (with `--system-info`) | +| `iptables.txt`, `nftables.txt`, `ipset.txt` | Firewall rules with packet counters (Linux, with `--system-info`) | +| `resolv.conf`, `scutil_dns.txt`, `resolved_domains.txt` | DNS resolver configuration and the domains NetBird resolved | +| `network_map.json` | Sync response: peers, routes, DNS settings, and firewall rules | +| `config.txt`, `state.json` | Client configuration and internal client state | + +With `--anonymize`, IP addresses, domains, and interface names are replaced consistently across every file, so the bundle stays readable while sensitive values are masked. Private keys and SSH keys are never included, and the packet capture (`capture.pcap`) is left out of anonymized bundles because it holds raw decrypted packets. + ### Debug for a specific time To capture logs for a specific time period, you can use the `debug for` command. 
This will generate a debug bundle after @@ -298,7 +388,7 @@ On Linux, the userspace packet filter is only active when the kernel firewall ba The client has environment variables for tuning routing, firewall behavior, ICE connectivity, and WireGuard mode. These can help work around edge cases (e.g. `NB_USE_LEGACY_ROUTING` for routing loop issues, `NB_WG_KERNEL_DISABLED` to force userspace WireGuard, `NB_SKIP_NFTABLES_CHECK` to fall back to iptables). See the full list at [Client Environment Variables](/client/environment-variables). -## Enabling debug logs on agent +## Enabling debug logs on the client Logs can be temporarily set using the following command. @@ -316,100 +406,29 @@ The next time the daemon is restarted, the log level will return to the configur Using `netbird down` and `netbird up` will not reset the log level. -To permanently set the log level, see the following sections. +To set the log level **permanently**, follow the steps for your platform: [Linux](/help/troubleshooting-client/linux#set-the-log-level-permanently), [Windows](/help/troubleshooting-client/windows#set-the-log-level-permanently), [macOS](/help/troubleshooting-client/macos#set-the-log-level-permanently), or [Android](/help/troubleshooting-client/android#enable-trace-logs-and-capture-them-with-adb). The default logging level is `info`. To revert back to the original state, you can repeat the procedure with `info` instead of `debug` or `trace`. -### On Linux with systemd - -The default systemd unit file reads a set of environment variables from the path `/etc/sysconfig/netbird`. 
-You can add the following line to the file to enable debug logs: - -```shell -sudo mkdir -p /etc/sysconfig -echo 'NB_LOG_LEVEL=debug' | sudo tee -a /etc/sysconfig/netbird -sudo systemctl restart netbird -``` +## Running the client in foreground mode -### On Other Linux and MacOS - -```shell -sudo netbird service stop -sudo netbird service uninstall -sudo netbird service install --log-level debug # or trace -sudo netbird service start -``` - -### On Windows - -You need to run the following commands with an elevated PowerShell or `cmd.exe` window. - -```powershell -[Environment]::SetEnvironmentVariable("NB_LOG_LEVEL", "debug", "Machine") -netbird service restart -``` - -### On Docker - -You can set the environment variable `NB_LOG_LEVEL` to `debug` to enable debug logs. - -```shell -docker run --rm --name PEER_NAME --hostname PEER_NAME --cap-add=NET_ADMIN --cap-add=SYS_ADMIN --cap-add=SYS_RESOURCE -d \ --e NB_SETUP_KEY= -e NB_LOG_LEVEL=debug -v netbird-client:/var/lib/netbird netbirdio/netbird:latest -``` - -### On Android - -Enable the ADB in the developer menu on the Android device. -In the app set the the Trace log level setting - it is a checkbox in the advanced menu. -With the ADB tool, you can get the logs from your device. The ADB is part of the SDK platform tools pack (zip file). -You can download it from [here](https://developer.android.com/tools/releases/platform-tools). -Please extract it and run the next command in the case of Linux: - -```shell -sudo adb logcat -v time | grep GoLog -``` - -## Running the agent in foreground mode - -You can run the agent in foreground mode to see the logs in the terminal. This is useful to debugging issues with the -agent. - -### Linux and MacOS +You can run the client in foreground mode to see the logs in the terminal. This is useful when debugging issues with the client. 
On Linux and macOS: ```shell sudo netbird service stop sudo netbird up -F ``` -### Windows - -On Windows, the agent depends on the Wireguard's `wintun.dll` and can only be executed as a system account. -To run the agent in foreground mode, you need to use a tool -called [PSExec](https://learn.microsoft.com/en-us/sysinternals/downloads/psexec). - -Once you have downloaded and extracted `psexec` open an elevated Powershell window: - -```shell -netbird service stop -.\PsExec64.exe -s cmd.exe /c "netbird up -F --log-level debug > c:\windows\temp\netbird.out.log 2>&1" -``` - -In case you need to configure environment variables, you need to add them as system variables so they get picked up by -the agent on the next psexec run: - -```powershell -[Environment]::SetEnvironmentVariable("PIONS_LOG_DEBUG", "all", "Machine") -```` +On Windows, foreground mode needs PSExec because the client runs as the system account. See [Run the client in foreground mode](/help/troubleshooting-client/windows#run-the-client-in-foreground-mode) on the Windows page. ## Enabling WireGuard in user space Sometimes, you want to test NetBird running on userspace mode instead of a kernel module. That can be a check to see if there is a problem with NetBird's firewall management in kernel mode. -You must run the agent in foreground mode and set the environment variable `NB_WG_KERNEL_DISABLED` to `true`. +You must run the client in foreground mode and set the environment variable `NB_WG_KERNEL_DISABLED` to `true`. ```shell sudo netbird service stop @@ -418,7 +437,7 @@ sudo bash -c 'NB_WG_KERNEL_DISABLED=true netbird up -F' > /tmp/netbird.log ## Debugging GRPC -The NetBird agent communicates with the Management and Signal servers using the GRPC framework. With these parameters, +The NetBird client communicates with the Management and Signal servers using the GRPC framework. With these parameters, you can set verbose logging for this service. 
@@ -429,7 +448,7 @@ sudo bash -c 'GRPC_GO_LOG_VERBOSITY_LEVEL=99 GRPC_GO_LOG_SEVERITY_LEVEL=info net ## Debugging ICE connections -The Netbird agent communicates with other peers through the Interactive Connectivity Establishment (ICE) protocol +The NetBird client communicates with other peers through the Interactive Connectivity Establishment (ICE) protocol described in the [RFC 8445](https://datatracker.ietf.org/doc/html/rfc8445). To debug the connection procedure, set verbose logging for the the [Pion/ICE](https://github.com/pion/ice) library with the `PIONS_LOG_DEBUG` or `PIONS_LOG_TRACE` variable. @@ -444,10 +463,6 @@ sudo netbird service stop sudo bash -c 'PIONS_LOG_DEBUG=all NB_LOG_LEVEL=debug netbird up -F' > /tmp/netbird.log ``` -## Host-based firewall issues - -NetBird automatically manages host-based firewall rules, but conflicts can occur with other firewall tools or security software. See [Ports & Firewalls — Host-based firewalls](/about-netbird/ports-and-firewalls#host-based-firewalls) for symptoms, platform-specific remediation (UFW, firewalld, Windows Firewall), and diagnostic commands. - ## Client login failures A single machine can only connect to one NetBird account as the same user/login method throughout the lifetime of @@ -490,37 +505,34 @@ Key while the NetBird client daemon is stopped: ## Debugging access to network resources -In this section we will be presenting methodology of troubleshooting access issues involving Netbird. +This section is the hands-on, command-level playbook for the case where a peer is `Connected` but a service behind a routing peer is unreachable. For the conceptual model first (where the traffic stops, and the TCP handshake as the dividing line between a NetBird problem and an application problem), start with [Troubleshooting resource connectivity](/help/troubleshooting-resource-connectivity). This section then walks the same checks with concrete commands. 
-We will start by presenting a glossary of all machines and services involved. -A sub-section will describe a specific use case. -Each will start with a concise summary of usual troubleshooting steps then expand into more detailed step-by-step -guides. +It uses a glossary of the machines and services involved, followed by a specific use case that opens with a concise summary of the usual steps and expands into a detailed, step-by-step walkthrough. ### Glossary -We will be using the following names for resources outside the Netbird network: +We will be using the following names for resources outside the NetBird network: - `int-net1`: an internal network `10.123.45.0/24`, - `srv-c`: an internal HTTP server running at `10.123.45.17`, - `int-dns1`: an internal DNS server running at `10.123.45.6`, -- `int-dns2`: an internal DNS server nunning at `10.7.8.9`, +- `int-dns2`: an internal DNS server running at `10.7.8.9`, - `cf-dns`: an Internet-accessible CloudFlare DNS server at `1.1.1.1` and `1.0.0.1`, -and following Netbird network resources: +and the following NetBird network resources: -- `peer-a`: end user's device running Netbird Client, -- `peer-b`: a linux server inside the internal network running Netbird Client, +- `peer-a`: end user's device running the NetBird client, +- `peer-b`: a Linux server inside the internal network running the NetBird client, - it has direct access to the whole `int-net1` IP range, -- `users:employees`: a Netbird Group containing `peer-a`, -- `routers:int-net1`: a Netbird Group containing `peer-b`, -- `access:srv-c`: a Netbird Groups used as a target of ACL rules for `srv-c` only, -- `access:int-net1`: a Netbird Groups used as a target of ACL rules for the whole subnet, -- `net-a`: a Netbird Network +- `users:employees`: a NetBird Group containing `peer-a`, +- `routers:int-net1`: a NetBird Group containing `peer-b`, +- `access:srv-c`: a NetBird Group used as a target of ACL rules for `srv-c` only, +- `access:int-net1`: a NetBird Group 
used as a target of ACL rules for the whole subnet, +- `net-a`: a NetBird Network - `net-a:srv-c`: a Network Resource handling traffic to `10.123.45.17/32` (`srv-c`), - `net-a:int-net1`: a Network Resource handling traffic to `10.123.45.0/24` (`int-net1`), -- `route:int-net1`: a Netbird Network Route handling traffic to `10.123.45.0/24` (`int-net1`), -- `route:srv-c`: a Netbird Network Route handling traffic to `10.123.45.17/32` (`srv-c`), +- `route:int-net1`: a NetBird Network Route handling traffic to `10.123.45.0/24` (`int-net1`), +- `route:srv-c`: a NetBird Network Route handling traffic to `10.123.45.17/32` (`srv-c`), ### Access from `peer-a` to `srv-c` @@ -528,8 +540,8 @@ In short: 1. Does `peer-b` have direct access to `srv-c`'s port `80`? 2. Can a routing peer `peer-b` forward traffic to `srv-c`? -3. Are Netbird's network routing resources configured? -4. Do Netbird's Access Control rules allow access from `peer-a` to the target's ACL Group? +3. Are NetBird's network routing resources configured? +4. Do NetBird's Access Control rules allow access from `peer-a` to the target's ACL Group? 5. Is `peer-a`'s operating system configured to use the route? Access Control rule is not required for connectivity from `peer-a` to `peer-b` @@ -562,7 +574,7 @@ Linux operating system: net.ipv4.ip_forward = 1 ``` -It should be set up automatically by the Netbird client unless it runs inside a container (which would not be able +It should be set up automatically by the NetBird client unless it runs inside a container (which would not be able to modify `sysctl`), then it requires manual setup. For setting up the value persistently (across reboots) please consult your operating system's documentation. @@ -574,9 +586,9 @@ Testing the functionality in practice involves: - adding a routing table entry to route `int-net1` (`10.123.45.0/24`) traffic through it, - trying to at least `ping 10.123.45.17` (`srv-c`) -#### Are Netbird's network routing resources configured? 
+#### Are NetBird's network routing resources configured? -For Netbird network routing resources configurations you can use either (new) _Networks_ or (old) _Routes_. +To configure NetBird network routing resources, you can use either (new) _Networks_ or (old) _Routes_. A Network `net-a` should have at minimum: @@ -597,7 +609,7 @@ You can loosen the rules and replace following to grant access to the whole `int - _Address_: `10.123.45.17/32` -> `10.123.45.0/24`, - _Assigned Groups_ / _Access Control Groups_: `access:srv-c` -> `access:int-net1` -#### Do Netbird's Access Control rules allow access from `peer-a` to the target's ACL Group? +#### Do NetBird's Access Control rules allow access from `peer-a` to the target's ACL Group? You can skip this check, when you are using (old) Network Route feature without filling in _Access Control Groups ( optional)_ section. @@ -634,10 +646,10 @@ Just like with the previous section you can loosen the above example by: #### Is `peer-a`'s operating system configured to use the route? -After all resources are configured in the Netbird management you should check whether they are +After all resources are configured in the NetBird management you should check whether they are properly registered with your operating system. -You can start by checking Netbird client's configuration with `netbird status -d` command: +You can start by checking the NetBird client's configuration with the `netbird status -d` command: ```shell % netbird status -d @@ -732,7 +744,7 @@ your specific subnet's clamped IP ranges (`10.123.45` in case of `int-net1`) and Depending on specifics of your Linux distribution (or even your configuration of it) you should be able to use either `iproute2` or `net-tools` family of network commands. 
-Netbird client stores it's custom routes in the routing table `7120` (or `0x1BD0`) when it's available (through +NetBird client stores its custom routes in the routing table `7120` (or `0x1BD0`) when it's available (through `iproute2` interface). For `iproute2` (`ip`, `ss` tools): diff --git a/src/pages/help/troubleshooting-client/android.mdx b/src/pages/help/troubleshooting-client/android.mdx new file mode 100644 index 00000000..db317d4f --- /dev/null +++ b/src/pages/help/troubleshooting-client/android.mdx @@ -0,0 +1,16 @@ +export const description = + "Android-specific NetBird client troubleshooting: enabling trace logs and capturing them with ADB." + +# NetBird client on Android + +Android-specific steps for the NetBird client. For everything cross-platform (client status, connectivity, login, and DNS), start from [Troubleshooting client issues](/help/troubleshooting-client). + +## Enable trace logs and capture them with ADB + +1. Enable **ADB** in the device's developer options. +2. In the NetBird app, set the **Trace** log level (a checkbox in the advanced menu). +3. Install the ADB platform tools (part of the [SDK platform-tools](https://developer.android.com/tools/releases/platform-tools) pack), then capture the logs. On Linux: + +```shell +sudo adb logcat -v time | grep GoLog +``` diff --git a/src/pages/help/troubleshooting-client/ios.mdx b/src/pages/help/troubleshooting-client/ios.mdx new file mode 100644 index 00000000..413db9f7 --- /dev/null +++ b/src/pages/help/troubleshooting-client/ios.mdx @@ -0,0 +1,14 @@ +import {Note} from "@/components/mdx" + +export const description = + "iOS-specific NetBird client troubleshooting and how to capture logs from the app." + +# NetBird client on iOS + +iOS-specific steps for the NetBird client. Most troubleshooting is cross-platform, so start from [Troubleshooting client issues](/help/troubleshooting-client). + +On-device debugging on iOS is more limited than on desktop platforms. 
To share diagnostics, capture logs from the NetBird iOS app and attach them to your report. + +<Note> +There are no iOS-specific debug commands yet. For status, connectivity, and DNS issues, follow the cross-platform [Troubleshooting client issues](/help/troubleshooting-client) and [DNS Troubleshooting](/manage/dns/troubleshooting) guides. If you can reproduce a problem, report it through [Community Support](/help/community-support). +</Note> diff --git a/src/pages/help/troubleshooting-client/linux.mdx b/src/pages/help/troubleshooting-client/linux.mdx new file mode 100644 index 00000000..ca94035b --- /dev/null +++ b/src/pages/help/troubleshooting-client/linux.mdx @@ -0,0 +1,42 @@ +export const description = + "Linux-specific NetBird client troubleshooting: setting the debug log level (systemd, service, Docker) and host-based firewalls." + +# NetBird client on Linux + +Linux-specific steps for the NetBird client. For everything cross-platform (client status, the debug bundle, GRPC and ICE debugging, login failures, and reaching resources), start from [Troubleshooting client issues](/help/troubleshooting-client). + +## Set the log level permanently + +The [temporary log level](/help/troubleshooting-client#enabling-debug-logs-on-the-client) resets when the daemon restarts. To make it permanent on Linux, use one of the following. 
+ +### systemd + +The default systemd unit reads environment variables from `/etc/sysconfig/netbird`: + +```shell +sudo mkdir -p /etc/sysconfig +echo 'NB_LOG_LEVEL=debug' | sudo tee -a /etc/sysconfig/netbird +sudo systemctl restart netbird +``` + +### Other init systems + +```shell +sudo netbird service stop +sudo netbird service uninstall +sudo netbird service install --log-level debug # or trace +sudo netbird service start +``` + +### Docker + +Set `NB_LOG_LEVEL=debug` on the container: + +```shell +docker run --rm --name PEER_NAME --hostname PEER_NAME --cap-add=NET_ADMIN --cap-add=SYS_ADMIN --cap-add=SYS_RESOURCE -d \ +-e NB_SETUP_KEY=<SETUP KEY> -e NB_LOG_LEVEL=debug -v netbird-client:/var/lib/netbird netbirdio/netbird:latest +``` + +## Host-based firewall + +NetBird manages its own rules, but UFW, firewalld, or endpoint security software can conflict and silently drop traffic. See [Ports & Firewalls: Host-based firewalls](/about-netbird/ports-and-firewalls#host-based-firewalls) for UFW and firewalld symptoms, remediation, and diagnostic commands. diff --git a/src/pages/help/troubleshooting-client/macos.mdx b/src/pages/help/troubleshooting-client/macos.mdx new file mode 100644 index 00000000..c48ad45f --- /dev/null +++ b/src/pages/help/troubleshooting-client/macos.mdx @@ -0,0 +1,21 @@ +export const description = + "macOS-specific NetBird client troubleshooting: setting the debug log level and host-based firewalls." + +# NetBird client on macOS + +macOS-specific steps for the NetBird client. For everything cross-platform (client status, the debug bundle, GRPC and ICE debugging, login failures, and reaching resources), start from [Troubleshooting client issues](/help/troubleshooting-client). + +## Set the log level permanently + +The [temporary log level](/help/troubleshooting-client#enabling-debug-logs-on-the-client) resets when the service restarts. 
To make it permanent on macOS, reinstall the service with the level set: + +```shell +sudo netbird service stop +sudo netbird service uninstall +sudo netbird service install --log-level debug # or trace +sudo netbird service start +``` + +## Host-based firewall + +The built-in macOS application firewall or third-party endpoint security software can block NetBird traffic before it leaves the machine. If connectivity works with that software temporarily disabled, add an exception for the NetBird process. See [Ports & Firewalls: Host-based firewalls](/about-netbird/ports-and-firewalls#host-based-firewalls) for the general approach. diff --git a/src/pages/help/troubleshooting-client/windows.mdx b/src/pages/help/troubleshooting-client/windows.mdx new file mode 100644 index 00000000..5185322e --- /dev/null +++ b/src/pages/help/troubleshooting-client/windows.mdx @@ -0,0 +1,47 @@ +import {Note} from "@/components/mdx" + +export const description = + "Windows-specific NetBird client troubleshooting: debug log level, foreground mode via PSExec, host-based firewall, and Windows DNS scenarios." + +# NetBird client on Windows + +Windows-specific steps for the NetBird client. For everything cross-platform (client status, the debug bundle, GRPC and ICE debugging, login failures, and reaching resources), start from [Troubleshooting client issues](/help/troubleshooting-client). + +## Set the log level permanently + +The [temporary log level](/help/troubleshooting-client#enabling-debug-logs-on-the-client) resets when the service restarts. To make it permanent, run an elevated PowerShell or `cmd.exe` window: + +```powershell +[Environment]::SetEnvironmentVariable("NB_LOG_LEVEL", "debug", "Machine") +netbird service restart +``` + +## Run the client in foreground mode + +On Windows the client depends on WireGuard's `wintun.dll` and can only run as the system account. To run it in foreground mode, use [PSExec](https://learn.microsoft.com/en-us/sysinternals/downloads/psexec). 
In an elevated PowerShell window: + +```shell +netbird service stop +.\PsExec64.exe -s cmd.exe /c "netbird up -F --log-level debug > c:\windows\temp\netbird.out.log 2>&1" +``` + +To pass environment variables, set them as machine-level variables so the client picks them up on the next PSExec run: + +```powershell +[Environment]::SetEnvironmentVariable("PIONS_LOG_DEBUG", "all", "Machine") +``` + +## Host-based firewall + +Windows Firewall or endpoint security software can block NetBird traffic before it leaves the machine. See [Ports & Firewalls: Host-based firewalls](/about-netbird/ports-and-firewalls#host-based-firewalls) for Windows Firewall symptoms, remediation, and diagnostic commands. + +## Windows DNS scenarios + +DNS on Windows has a few platform-specific failure modes worth checking separately: + +- **Match-domain names don't resolve, even though the NRPT (Name Resolution Policy Table) rule was written.** A lingering Group Policy `DnsPolicyConfig` container can stop NetBird's rule from taking effect on an off-domain machine. See [DNS Troubleshooting: Issue 8 (lingering GPO)](/manage/dns/troubleshooting#issue-8-windows-nrpt-rule-is-written-but-never-takes-effect-lingering-gpo). +- **Active Directory login, mapped drives, or DFS fail** while a file share by IP works. This is usually a DC-locator (`SRV` record) problem. See [Domain Controllers as routing peers](/manage/dns/internal-dns-servers#domain-controllers-as-routing-peers). + +<Note> +For the full DNS diagnostic flow on any platform, see [DNS Troubleshooting](/manage/dns/troubleshooting). 
+</Note> diff --git a/src/pages/help/troubleshooting-relayed-connections.mdx b/src/pages/help/troubleshooting-relayed-connections.mdx index c911fa65..c9c3331a 100644 --- a/src/pages/help/troubleshooting-relayed-connections.mdx +++ b/src/pages/help/troubleshooting-relayed-connections.mdx @@ -5,9 +5,9 @@ export const description = "Learn why a NetBird connection is Relayed instead of # Troubleshooting relayed connections -NetBird always prefers a direct peer-to-peer (P2P) connection and falls back to a relay server when a direct path can't be established. A relayed connection works — it just adds latency and shares the relay's bandwidth, because traffic travels through an intermediary instead of flowing directly between the peers. This page teaches you to find out *why* a connection is relayed, fix it when it's fixable, and recognize the cases where relay is the correct outcome rather than a fault. +NetBird always prefers a direct peer-to-peer (P2P) connection and falls back to a relay server when a direct path can't be established. A relayed connection works. It just adds latency and shares the relay's bandwidth, because traffic travels through an intermediary instead of flowing directly between the peers. This page teaches you to find out *why* a connection is relayed, fix it when it's fixable, and recognize the cases where relay is the correct outcome rather than a fault. -The endpoints on this page are for **NetBird Cloud**. The flow is identical for self-hosted deployments — substitute your own Signal, STUN, and Relay endpoints from the [self-hosted port requirements](/selfhosted/selfhosted-guide#port-requirements). +The endpoints on this page are for **NetBird Cloud**. The flow is identical for self-hosted deployments. Substitute your own Signal, STUN, and Relay endpoints from the [self-hosted port requirements](/selfhosted/selfhosted-guide#port-requirements). A relayed connection is just as secure as a direct one. 
Traffic is encrypted end to end with WireGuard before it leaves the device, and the relay only forwards packets it cannot read. The trade-off is latency and throughput, never confidentiality. @@ -41,15 +41,15 @@ Find the peer in question and look at the **Connection type** field: | Field | What it tells you | |---|---| | `Connection type: Relayed` | Traffic flows through a relay server instead of directly between the peers | -| `ICE candidate (Local/Remote)` | How each side is connecting — the key diagnostic, explained below | +| `ICE candidate (Local/Remote)` | How each side is connecting (the key diagnostic, explained below) | | `Relay server address` | Which relay server carries the connection | | `Last WireGuard handshake` | A recent handshake means the tunnel itself is healthy, relayed or not | -If the connection type is `P2P` but the link is slow, stop here — that's a different problem (path quality, MTU, or load), not a relay issue. Start from the [general client troubleshooting page](/help/troubleshooting-client#net-bird-agent-status) instead. +If the connection type is `P2P` but the link is slow, stop here. That's a different problem (path quality, MTU, or load), not a relay issue. Start from the [general client troubleshooting page](/help/troubleshooting-client#net-bird-client-status) instead. ## The mental model -If you remember one thing, remember this: **a relayed connection is not a failure — it's the safety net after a failed hole punch.** Your job is to find out which of two worlds you're in: +If you remember one thing, remember this: **a relayed connection is not a failure; it's the safety net after a failed hole punch.** Your job is to find out which of two worlds you're in: ``` Why is this connection relayed? 
@@ -58,23 +58,23 @@ If you remember one thing, remember this: **a relayed connection is not a failur │ │ A fixable blocker An unfixable NAT (Signal, STUN, or UDP is (both sides scramble ports - blocked somewhere — find per destination — relay is + blocked somewhere, find per destination, relay is it and remove it) doing its designed job) ``` -You can't ask NetBird to measure NAT behavior directly, so you work by **elimination**: first triage for environments that are known to defeat hole punching, then clear the fixable blockers one by one. If every check passes on both peers and the connection is still relayed, you have proven the NAT is the cause — and the relay is the designed answer, not a problem left to fix. +You can't ask NetBird to measure NAT behavior directly, so you work by **elimination**: first triage for environments that are known to defeat hole punching, then clear the fixable blockers one by one. If every check passes on both peers and the connection is still relayed, you have proven the NAT is the cause, and the relay is the designed answer, not a problem left to fix. ## The four players Four things decide whether a connection goes direct. Each one maps to exactly one check in the flow below. -**NAT — the obstacle.** Routers rewrite addresses, so a peer behind NAT can't receive unsolicited traffic. Most NATs hand out a *predictable* public address that hole punching can use; symmetric NATs and carrier-grade NAT (CGNAT) hand out a *different* one per destination, which defeats hole punching entirely. The theory lives in [Understanding NAT and Connectivity](/about-netbird/understanding-nat-and-connectivity). +**NAT, the obstacle.** Routers rewrite addresses, so a peer behind NAT can't receive unsolicited traffic. Most NATs hand out a *predictable* public address that hole punching can use; symmetric NATs and carrier-grade NAT (CGNAT) hand out a *different* one per destination, which defeats hole punching entirely. 
The theory lives in [Understanding NAT and Connectivity](/about-netbird/understanding-nat-and-connectivity). -**Signal — the messenger.** Peers exchange their candidate addresses through the Signal service (`signal.netbird.io`, TCP/443). If Signal is unreachable, the peers can't even compare notes, and the connection silently lands on the relay. +**Signal, the messenger.** Peers exchange their candidate addresses through the Signal service (`signal.netbird.io`, TCP/443). If Signal is unreachable, the peers can't even compare notes, and the connection silently lands on the relay. -**STUN — the mirror.** A peer discovers its own public address by asking a STUN server (`stun.netbird.io`, UDP 80, 443, 3478, 5555). If outbound UDP to STUN is blocked, the peer never learns a public candidate and hole punching never starts. +**STUN, the mirror.** A peer discovers its own public address by asking a STUN server (`stun.netbird.io`, UDP 80, 443, 3478, 5555). If outbound UDP to STUN is blocked, the peer never learns a public candidate and hole punching never starts. -**Relay — the safety net.** When no direct path works, both peers connect outbound to a relay (`*.relay.netbird.io`, TCP/443) and traffic flows through it, still end-to-end encrypted. +**Relay, the safety net.** When no direct path works, both peers connect outbound to a relay (`*.relay.netbird.io`, TCP/443) and traffic flows through it, still end-to-end encrypted. The authoritative endpoint and port list is in [Ports & Firewalls](/about-netbird/ports-and-firewalls). @@ -99,7 +99,7 @@ Work through these steps in order. Each one tells you when to continue and when ``` Confirm it's Relayed │ -1. Environment triage — are BOTH peers on known-symmetric networks? +1. Environment triage: are BOTH peers on known-symmetric networks? │ yes → relay is expected, stop here │ no / unsure 2. Control plane reachable? 
(Signal + Management, TCP/443) @@ -112,34 +112,34 @@ Confirm it's Relayed relay is the designed path ``` -### Step 1 — Environment triage +### Step 1: Environment triage Some networks are known to defeat hole punching, no matter how clean the firewall config is. Before checking anything else, ask where each peer sits: -- **Mobile and cellular connections** — carriers use CGNAT, which usually behaves symmetrically. -- **Cloud NAT gateways** (AWS NAT Gateway, GCP Cloud NAT) — symmetric by design for instances without a public IP. -- **Enterprise firewalls in strict mode** — Cisco ASA, Palo Alto, Fortinet and similar devices often default to symmetric NAT, sometimes labeled "strict NAT" in their settings. +- **Mobile and cellular connections**: carriers use CGNAT, which usually behaves symmetrically. +- **Cloud NAT gateways** (AWS NAT Gateway, GCP Cloud NAT): symmetric by design for instances without a public IP. +- **Enterprise firewalls in strict mode**: Cisco ASA, Palo Alto, Fortinet and similar devices often default to symmetric NAT, sometimes labeled "strict NAT" in their settings. -If **both** peers sit on networks like these, hole punching can't succeed and no amount of firewall tuning will change that — the relay is the expected outcome, and you can stop here (see [when relay is the right answer](#when-relay-is-the-right-answer)). If only one side does, or you're not sure, keep going: one predictable side is usually enough for P2P. +If **both** peers sit on networks like these, hole punching can't succeed and no amount of firewall tuning will change that. The relay is the expected outcome, and you can stop here (see [when relay is the right answer](#when-relay-is-the-right-answer)). If only one side does, or you're not sure, keep going: one predictable side is usually enough for P2P. -A CGNAT tell: the public address your network presents is in `100.64.0.0/10`, a range reserved for carrier-grade NAT. 
Don't confuse it with your own NetBird IP — NetBird intentionally uses the same range for its overlay network, so only the address your *ISP-facing* connection shows counts. +A CGNAT tell: the public address your network presents is in `100.64.0.0/10`, a range reserved for carrier-grade NAT. Don't confuse it with your own NetBird IP. NetBird intentionally uses the same range for its overlay network, so only the address your *ISP-facing* connection shows counts. -### Step 2 — Is the control plane reachable? +### Step 2: Is the control plane reachable? -If a peer can't reach the Signal service, candidates are never exchanged and the connection goes straight to relay — a common silent cause. From the peer, confirm outbound TCP/443 to both control-plane endpoints: +If a peer can't reach the Signal service, candidates are never exchanged and the connection goes straight to relay, a common silent cause. From the peer, confirm outbound TCP/443 to both control-plane endpoints: ```bash curl -sf https://api.netbird.io/api > /dev/null && echo "management: OK" nc -zv signal.netbird.io 443 ``` -Both must succeed. If they don't, fix outbound TCP/443 to these endpoints first — nothing else matters until the peers can talk to the control plane. +Both must succeed. If they don't, fix outbound TCP/443 to these endpoints first. Nothing else matters until the peers can talk to the control plane. -### Step 3 — Is STUN reachable? +### Step 3: Is STUN reachable? -Hole punching starts with STUN, and STUN runs over UDP. The best evidence is already in `netbird status -d` — the `Relays:` section near the bottom reports reachability of every STUN, TURN, and relay endpoint: +Hole punching starts with STUN, and STUN runs over UDP. The best evidence is already in `netbird status -d`. 
The `Relays:` section near the bottom reports reachability of every STUN, TURN, and relay endpoint: ``` Relays: @@ -148,19 +148,19 @@ Relays: [rels://us-nyc-2.relay.netbird.io:443] is Available ``` -Any `Unavailable` entry for a `stun:` or `turn:` endpoint means outbound UDP is being dropped on the path — typically by the site's egress firewall. Ask whoever runs it to allow outbound UDP on ports 80, 443, 3478, and 5555 to `stun.netbird.io` and `turn.netbird.io`; the exact list and example rules are in [Ports & Firewalls](/about-netbird/ports-and-firewalls#outgoing-ports). +Any `Unavailable` entry for a `stun:` or `turn:` endpoint means outbound UDP is being dropped on the path, typically by the site's egress firewall. Ask whoever runs it to allow outbound UDP on ports 80, 443, 3478, and 5555 to `stun.netbird.io` and `turn.netbird.io`; the exact list and example rules are in [Ports & Firewalls](/about-netbird/ports-and-firewalls#outgoing-ports). Every fix on this page is an **outbound** firewall rule or an allowance on the host's `wt0` interface. NetBird never needs an inbound port opened on your perimeter firewall. -### Step 4 — Is a host firewall in the way? +### Step 4: Is a host firewall in the way? -The host's own firewall or security software can block UDP before it ever leaves the machine. Telltale symptoms: peers show `Connected` but can't be pinged, or two peers on the *same office LAN* connect relayed because a host firewall drops their unsolicited direct packets. Both cases, with platform-specific checks and fixes for UFW, firewalld, and Windows Firewall, are covered in [Ports & Firewalls — Host-based firewalls](/about-netbird/ports-and-firewalls#host-based-firewalls). +The host's own firewall or security software can block UDP before it ever leaves the machine. Telltale symptoms: peers show `Connected` but can't be pinged, or two peers on the *same office LAN* connect relayed because a host firewall drops their unsolicited direct packets. 
Both cases, with platform-specific checks and fixes for UFW, firewalld, and Windows Firewall, are covered in [Ports & Firewalls: Host-based firewalls](/about-netbird/ports-and-firewalls#host-based-firewalls). -Endpoint protection software (CrowdStrike, ESET, Sophos and similar) often ships its own firewall that overrides OS rules — if connectivity works with it temporarily disabled, add an exception for the NetBird process. +Endpoint protection software (CrowdStrike, ESET, Sophos and similar) often ships its own firewall that overrides OS rules. If connectivity works with it temporarily disabled, add an exception for the NetBird process. -### Step 5 — Repeat on the other peer +### Step 5: Repeat on the other peer A connection has two ends, and **both** must pass steps 2–4 for hole punching to work. A perfectly clean laptop still gets a relayed connection if the server's egress firewall silently drops UDP. Run the same checks on the second peer before drawing any conclusion. @@ -168,9 +168,9 @@ A connection has two ends, and **both** must pass steps 2–4 for hole punching Can't shell into the far peer? Administrators can trigger a debug bundle remotely from the dashboard (**Peers → select peer → Run Remote Job**). See [remote debug bundle generation](/help/troubleshooting-client#remote-debug-bundle-generation). -### Step 6 — Conclude, or escalate +### Step 6: Conclude, or escalate -If every check passes on both peers and the connection is still relayed, you've proven by elimination that a symmetric NAT is in the path. Accept the relay — it's the [designed behavior for exactly this case](#when-relay-is-the-right-answer), and it costs latency, not security. +If every check passes on both peers and the connection is still relayed, you've proven by elimination that a symmetric NAT is in the path. Accept the relay. It's the [designed behavior for exactly this case](#when-relay-is-the-right-answer), and it costs latency, not security. 
If instead something looks wrong but you can't place it, collect evidence and escalate: @@ -182,8 +182,8 @@ If instead something looks wrong but you can't place it, collect evidence and es A remote engineer's laptop reaches `build-server` in the office, but `netbird status -d` on the laptop shows the connection is relayed and latency is poor. Working the flow: -1. **Confirm.** The laptop shows `Connection type: Relayed` and `ICE candidate (Local/Remote): srflx/relay`. The laptop's own side reached STUN fine (`srflx`) — the weak side is the server. -2. **Triage.** The laptop is on home fiber, the server on the office LAN. Neither is mobile, CGNAT, or behind a cloud NAT gateway — so this should be fixable. Continue. +1. **Confirm.** The laptop shows `Connection type: Relayed` and `ICE candidate (Local/Remote): srflx/relay`. The laptop's own side reached STUN fine (`srflx`); the weak side is the server. +2. **Triage.** The laptop is on home fiber, the server on the office LAN. Neither is mobile, CGNAT, or behind a cloud NAT gateway, so this should be fixable. Continue. 3. **Control plane, on the server.** `curl` to the Management API and `nc -zv signal.netbird.io 443` both succeed. 4. **STUN, on the server.** The `Relays:` section shows `[stun:stun.netbird.io:3478] is Unavailable, reason: stun request: context deadline exceeded`. The office egress firewall is dropping outbound UDP. 5. **Fix and verify.** IT allows outbound UDP on ports 80, 443, 3478, 5555 to `stun.netbird.io` and `turn.netbird.io`. On the server, restart the connection with `netbird down && netbird up`, then re-check from the laptop. @@ -195,7 +195,7 @@ A remote engineer's laptop reaches `build-server` in the office, but `netbird st ``` -Both sides now discover their public addresses, hole punching succeeds, and traffic flows directly — latency drops from ~90 ms to ~15 ms with no relay in the path. 
+Both sides now discover their public addresses, hole punching succeeds, and traffic flows directly; latency drops from ~90 ms to ~15 ms with no relay in the path. ## When relay is the right answer @@ -204,13 +204,13 @@ Stop troubleshooting and accept the relay when: - **Both peers are on mobile/CGNAT connections.** The carrier's NAT is symmetric and outside anyone's control. - **Corporate policy blocks outbound UDP and won't change.** Relay over TCP/443 is the designed path through such networks. -- **A cloud NAT gateway can't be re-architected.** If the instance *can* get a public IP (an Elastic IP on AWS), that restores P2P without opening anything inbound — security groups still only need outbound rules, and the gateway's symmetric NAT drops out of the path. If it can't, relay it is. -- **The NAT device belongs to someone else** — a hotel, a café, a customer site. +- **A cloud NAT gateway can't be re-architected.** If the instance *can* get a public IP (an Elastic IP on AWS), that restores P2P without opening anything inbound: security groups still only need outbound rules, and the gateway's symmetric NAT drops out of the path. If it can't, relay it is. +- **The NAT device belongs to someone else**: a hotel, a café, a customer site. Some teams even prefer relayed connections in locked-down networks, because the only flows leaving the perimeter are outbound TCP/443. That's a legitimate posture: the cost is latency, never confidentiality. In all of these cases, relay is NetBird working as designed, not a fault. -Guides elsewhere sometimes suggest forwarding a UDP port to a peer to force P2P past a symmetric NAT. It can work — but it gives up NetBird's core promise that you never open an inbound port on your network. Accept the relayed connection instead; carrying traffic past unfixable NATs without exposing anything is exactly what it's for. +Guides elsewhere sometimes suggest forwarding a UDP port to a peer to force P2P past a symmetric NAT. 
It can work, but it gives up NetBird's core promise that you never open an inbound port on your network. Accept the relayed connection instead; carrying traffic past unfixable NATs without exposing anything is exactly what it's for. ## Rollout checklist @@ -219,13 +219,13 @@ To keep a whole fleet on direct connections rather than fixing peers one at a ti - **Allow outbound UDP to STUN/TURN** (`stun.netbird.io`, `turn.netbird.io`, ports 80, 443, 3478, 5555) at every site's egress firewall. - **Wildcard `*.relay.netbird.io` on TCP/443** so the relay fallback survives rotation of the geo-distributed relay pool. -- **Watch the `Relays:` section** of `netbird status -d` during rollout — fix `Unavailable` entries before users report slowness. +- **Watch the `Relays:` section** of `netbird status -d` during rollout, and fix `Unavailable` entries before users report slowness. - **Bake the `wt0` allowance into host-firewall baselines** (UFW/firewalld/Windows images), so host firewalls never silently block decrypted traffic. -- **Decide per site whether relay is acceptable**, and document it — a deliberate relay is fine; a surprising one costs a support ticket. +- **Decide per site whether relay is acceptable**, and document it. A deliberate relay is fine; a surprising one costs a support ticket. ## Recap -In one breath: **NAT** is the obstacle, **Signal** is the messenger, **STUN** is the mirror, and **Relay** is the safety net. A relayed connection means hole punching failed — either because something fixable blocks Signal, STUN, or UDP (find it: control plane → STUN → host firewall → both ends), or because both peers sit behind symmetric NAT, which you prove by elimination. Every fix is an outbound rule; nothing is ever opened inbound. And when the NAT itself is the cause, the relay is doing exactly the job it was built for — keeping peers connected without exposing anything, with end-to-end encryption intact. 
+In one breath: **NAT** is the obstacle, **Signal** is the messenger, **STUN** is the mirror, and **Relay** is the safety net. A relayed connection means hole punching failed, either because something fixable blocks Signal, STUN, or UDP (find it: control plane → STUN → host firewall → both ends), or because both peers sit behind symmetric NAT, which you prove by elimination. Every fix is an outbound rule; nothing is ever opened inbound. And when the NAT itself is the cause, the relay is doing exactly the job it was built for, keeping peers connected without exposing anything, with end-to-end encryption intact. +NetBird's job ends at the resource's `IP:port`. Once the TCP handshake there completes, NetBird has delivered the traffic. A reset, an empty reply, or a TLS or auth error after that point lives in the application or the host, not in NetBird. + + +## First, confirm where it stops + +On the client, check the routing peer and the route in detail: + +```bash +netbird status -d +``` + +Find the routing peer, confirm the tunnel is healthy, and check that the resource address falls inside a range the peer actually routes: + +``` + routing-peer-1.netbird.cloud: + NetBird IP: 100.92.0.12 + Status: Connected + -- detail -- + Connection type: P2P + Last WireGuard handshake: 12 seconds ago + Networks: 10.0.50.0/24 +``` + +| Field | What it tells you | +|---|---| +| `Status: Connected` | The tunnel to the routing peer is up, so the problem is past the peer | +| `Connection type` | If `Relayed`, fix transport first (see the relayed-connections page) | +| `Networks` | The ranges this peer routes; your resource's address must fall inside one | +| `Last WireGuard handshake` | A recent handshake means the tunnel itself is healthy | + +If the peer is not `Connected`, stop here. That is a transport problem, not a resource one, so start from the [general client troubleshooting page](/help/troubleshooting-client#net-bird-client-status). 
+ +## The mental model + +If you remember one thing, remember this: **NetBird carries traffic through the tunnel to the resource's `IP:port`, and nothing past it.** Your job is to find out which of two worlds you are in: + +``` + Can the client reach the resource? + │ + ┌───────────────────┴───────────────────┐ + │ │ + NetBird isn't delivering NetBird delivered fine + (peer, DNS, route, ACL, or (the handshake to IP:port + forwarding on the peer; find completed; a reset or error + it and fix it) after that is the app or host) +``` + +The dividing line is the **TCP handshake to the resource's `IP:port`**. Before it completes, the problem is somewhere in NetBird's path. Once it completes, NetBird has done its job, and a reset, an immediate close, an empty response, a TLS or cert error, an auth failure, or a wrong upstream target lives in the application or the host. You work this by **elimination**, top to bottom: confirm the peer is connected, that the name resolves, that a route and policy exist, that the peer forwards the traffic, and finally where the handshake succeeds or dies. + +## The five layers + +Five things sit between the client and the resource. Each one maps to exactly one check in the flow below. + +**Peer connectivity.** The client needs a healthy tunnel to the routing peer. If it doesn't, nothing downstream matters, and if the link is merely *relayed*, that belongs on a separate page. + +**DNS.** For a domain resource, the name resolves **on the routing peer**, using the peer's own resolver. The peer has to be able to resolve it. See [DNS Troubleshooting](/manage/dns/troubleshooting). + +**Route and ACL.** The resolved address has to be installed as a route on the client, and a policy has to permit the exact **protocol and port**. Policies are per-protocol, so a TCP-only policy silently drops the UDP half of a protocol that uses both. 
See [Access Control](/manage/access-control) and the forward-chain vs input-chain note in [Networks](/manage/networks). + +**Forwarding on the peer.** The routing peer has to put the packet onto its LAN toward the resource. One subtlety causes most of these cases. If the resource resolves to the **routing peer's own IP** (a service running on the peer itself), that is the *input chain*, not the *forward chain*. It needs a peer-to-peer policy, and for userspace peers it also needs `NB_ENABLE_LOCAL_FORWARDING=true` (see [Environment Variables](/client/environment-variables)). A subnet or host resource on a *different* LAN machine uses the forward chain and masquerade instead. + +**The handoff.** Once a TCP handshake to `IP:port` completes, NetBird is done. + +## Reading where the handshake dies + +The fastest way to assign ownership is to capture the attempt and read the result: + +| What you see | Owner | +|---|---| +| Peer not `Connected`, stale WireGuard handshake | NetBird transport (see relayed-connections page) | +| Resource resolves to the wrong or empty IP | NetBird DNS, or the peer's upstream resolver | +| Route or allowed-IP missing for the resolved address | NetBird route | +| Right port blocked, or the UDP half dropped | NetBird ACL (protocol and port) | +| `SYN` leaves the peer's LAN NIC, no `SYN-ACK`, fails on-LAN too | Resource host, service, or its firewall | +| Handshake completes, then `RST`, `FIN`, empty, auth, or cert error | Application or gateway config | +| Works from the resource's own LAN, fails only via the tunnel | NetBird forwarding (escalate) | + +## The decision flow + +Work through these steps in order. Each one tells you when to continue and when to stop. + +``` +Peer Connected? ── no ──▶ fix transport (relayed-connections page) + │ yes + ▼ +1. Does the name resolve to the expected IP? (on the peer, for domain resources) +2. Is a route installed, and does a policy allow this protocol and port? +3. Does the peer forward it? 
(forward chain vs self-targeted local forwarding) +4. Does the handshake to the resource IP:port complete? + ├─ yes ──▶ NetBird is done; investigate the application or host + └─ no ──▶ run the LAN bypass test: + ├─ fails on-LAN too ──▶ not NetBird (service, host, or firewall) + └─ works on-LAN ──▶ NetBird forwarding; escalate with captures +``` + + +For a hands-on, command-by-command version of these checks, with a worked example tracing a client to an internal server, see [Debugging access to network resources](/help/troubleshooting-client#debugging-access-to-network-resources). + + +### Step 1: Does the name resolve? + +For a domain resource, the name has to resolve to the expected address **on the routing peer**, because the peer is what resolves and forwards it. A name that resolves on your client but not on the peer, or that resolves to a stale or empty address, sends the traffic nowhere. Confirm the resolved address, then check it against the `Networks` ranges from the status output above. For resolver and routing problems, see [DNS Troubleshooting](/manage/dns/troubleshooting). + +### Step 2: Is there a route and a matching policy? + +The resolved address has to be installed as a route on the client, and a policy has to allow the **exact protocol and port**. A policy that allows TCP but not UDP silently drops the UDP half of a protocol that needs both, which looks like a partial outage. + + +`netbird debug trace` simulates a packet through the firewall rules without sending real traffic, so you can confirm an ACL verdict directly. For example: `netbird debug trace in 100.64.1.1 self -p tcp --dport 3389`. It is most useful on macOS and Windows. See [tracing firewall rules](/help/troubleshooting-client#tracing-firewall-rules). + + +### Step 3: Does the peer forward it? + +The routing peer has to put the packet onto its LAN. The common trap is a resource that resolves to the routing peer's own IP: that path is the *input chain*, not the *forward chain*. 
It needs a peer-to-peer policy, and on userspace peers it also needs `NB_ENABLE_LOCAL_FORWARDING=true` ([Environment Variables](/client/environment-variables)). A resource on a different LAN machine uses the forward chain and masquerade, and is covered by the forward-chain vs input-chain note in [Networks](/manage/networks). + +### Step 4: Where does the handshake die? + +This is the dividing line. Capture the connection attempt and watch the TCP handshake to the resource's `IP:port`. + +- If the handshake **completes**, NetBird delivered the traffic. A reset, an empty reply, or a TLS or auth error after that belongs to the application or host. +- If there is **no `SYN-ACK`**, run the LAN bypass test below to decide whether NetBird or the resource owns it. + + +The LAN bypass test: connect to the resource's `IP:port` from another host on its own subnet, outside NetBird entirely. If it fails there too, it was never NetBird. If it works there but not through the tunnel, the problem is NetBird forwarding, so escalate. + + +### Step 5: Conclude, or escalate + +If the handshake completes, or the resource fails on its own LAN too, the answer is in the application or host, not in NetBird. If the resource is reachable on its own LAN but never through the tunnel, you have a NetBird forwarding problem worth escalating. Collect evidence before you do: + +- A [debug bundle](/help/troubleshooting-client#debug-bundle) from the client and, where possible, the routing peer: `netbird debug bundle --system-info` +- The `netbird status -d` output showing the peer and its `Networks` +- A packet capture from the routing peer's LAN NIC showing the `SYN` leaving and no `SYN-ACK` returning + +## Walkthrough: a gateway that closes the connection + +A client reaches a Windows routing peer running a remote-desktop gateway, but the RDP session never opens. The mirror setup on an adjacent gateway works with an identical policy. Working the flow: + +1. 
**Peer and DNS.** The peer is `Connected`, and the resource resolves to the peer's own LAN IP, so the gateway runs *on* the routing peer. +2. **Route and ACL.** The resolved `/32` is installed on the client and the policy permits the gateway's ports. `netbird debug trace` confirms ACCEPT. +3. **Forwarding.** `NB_ENABLE_LOCAL_FORWARDING=true` is set, matching the working mirror, so local delivery is enabled. +4. **The handshake.** A client-side capture shows the TCP handshake to the gateway `IP:port` completing, the client sending its first message, and the gateway immediately closing the connection with zero bytes returned. + +The handshake completed, so NetBird delivered the traffic correctly. The gateway itself rejected the session. + + +Root cause: an incorrect upstream destination was registered on the gateway. Nothing in the NetBird path was at fault, which is why an identical policy on the mirror behaved differently. + + +## When it's not NetBird + +Stop looking at NetBird and investigate the resource side when: + +- **The handshake to `IP:port` completes** but the application resets, closes, returns nothing, or errors on TLS or auth. That is the service. +- **The LAN bypass test fails too.** The service, its bind address, or the host firewall is the problem, independent of NetBird. +- **A proxy or gateway on the routing peer points at a wrong or dead upstream.** That is its own configuration, not the tunnel. + +NetBird supports the overlay: transport, routing, ACLs, and DNS. The proxies, firewalls, and applications behind your resources sit outside it, and are almost always verifiable locally on the resource side. The [Reverse Proxy troubleshooting](/manage/reverse-proxy/troubleshooting) page states the same boundary for proxy targets. + +## Checklist + +To resolve these quickly and avoid surprises across a fleet: + +- **Confirm the peer is `Connected` and not `Relayed`** before looking any further downstream. 
+- **Resolve the name on the routing peer**, not only on the client, for domain resources. +- **Match the policy to the exact protocol and port**, and remember that TCP-only policies drop the UDP half. +- **Know whether the resource is self-targeted** (input chain, local forwarding) or on a separate LAN machine (forward chain, masquerade). +- **Use the handshake as the verdict.** If it completes, hand the issue to the application or host owner with the capture attached. + +## Recap + +In one breath: **NetBird carries traffic to the resource's `IP:port`, and the handshake there is the dividing line.** Work top to bottom: peer connected, then name resolves, then route and policy exist, then the peer forwards it, then the handshake. If the handshake never completes and the resource is reachable on its own LAN, it is a NetBird delivery problem worth escalating. If the handshake completes, or the resource fails on its own LAN too, the answer is in the application or host, and that is where to look. + + diff --git a/src/pages/help/troubleshooting.mdx b/src/pages/help/troubleshooting.mdx new file mode 100644 index 00000000..f9514d4a --- /dev/null +++ b/src/pages/help/troubleshooting.mdx @@ -0,0 +1,96 @@ +import { TroubleshootingTiles } from "@/components/TroubleshootingTiles" +import { StillStuck } from "@/components/StillStuck" + +export const description = + "Start here to troubleshoot NetBird. Find your issue by area and jump straight to the relevant guide." + +# Troubleshooting + +Something not working as expected? Pick the area closest to your problem and jump to the +guide and section that covers it. Each section links to troubleshooting docs that live +next to the feature they cover, so nothing here is a copy. 
+ + + + diff --git a/src/pages/manage/dns/troubleshooting.mdx b/src/pages/manage/dns/troubleshooting.mdx index 79c4cec3..52a7ed27 100644 --- a/src/pages/manage/dns/troubleshooting.mdx +++ b/src/pages/manage/dns/troubleshooting.mdx @@ -6,6 +6,8 @@ export const description = 'Diagnose and fix common DNS issues in NetBird' This guide helps you diagnose and resolve common DNS issues in NetBird. Follow the structured approach below to identify and fix problems quickly. +If a name resolves correctly but a service behind a routing peer is still unreachable, the problem is past DNS. See [Troubleshooting resource connectivity](/help/troubleshooting-resource-connectivity) to find which layer is dropping the traffic. + The steps below use `ping` to check reachability. `ping` requires an **ICMP** peer-to-peer policy — when testing through or to a [routing peer](/manage/networks/how-routing-peers-work), name resolution and TCP can be healthy while ICMP is simply not allowed by policy. Prefer a TCP port test there: `Test-NetConnection -Port ` (Windows) or `nc -z `. @@ -473,6 +475,37 @@ A domain resource only resolves `A`/`AAAA` records. Active Directory also depend --- +### Issue 8: Windows NRPT rule is written but never takes effect (lingering GPO) + +**Symptoms**: +- Match-domain names don't resolve on a Windows client, even though `netbird status -d` shows the nameserver as Available and the client log records the NRPT (Name Resolution Policy Table) rule as written. +- The machine is off-domain (not currently on the company network), often a remote or personal device that was once domain-joined. + +**Diagnosis**: + +NetBird writes its NRPT rule correctly, but a stale Windows Group Policy Object (GPO) `DnsPolicyConfig` container forces the rule into the *policy* store. On an off-domain machine Windows does not apply policy-store NRPT rules, so the rule exists but is never effective. The write succeeds while Windows quietly drops it from resolution. 
+ +From a [debug bundle](/help/troubleshooting-client#debug-bundle), `client.log` and the matching entry in `state.json` show whether the NetBird client detected a GPO in place. + +Then compare the rule NetBird wrote against what Windows is actually applying (PowerShell): + +```powershell +Get-DnsClientNrptRule # the rule NetBird wrote +Get-DnsClientNrptPolicy -Effective # what Windows is actually applying +``` + +If the rule appears in `Get-DnsClientNrptRule` but not in the `-Effective` output, a lingering GPO container is blocking it. + +**Solutions**: + +This is a Windows Group Policy state issue, not a NetBird misconfiguration, and the fix has to happen locally: +- Have local IT clear the stale `DnsPolicyConfig` GPO container from the machine's registry, or +- Connect the machine to the company network and run `gpupdate /force` so Windows reconciles and removes the lingering container. + +NetBird cannot remove a Group Policy container from the client side. + +--- + ## Verifying Configuration ### Public nameservers diff --git a/src/pages/manage/networks/how-routing-peers-work.mdx b/src/pages/manage/networks/how-routing-peers-work.mdx index 1b6c88a0..a805e6d0 100644 --- a/src/pages/manage/networks/how-routing-peers-work.mdx +++ b/src/pages/manage/networks/how-routing-peers-work.mdx @@ -213,6 +213,8 @@ Specifics: ## Observability and troubleshooting +When a peer is `Connected` but a resource behind it is still unreachable, [Troubleshooting resource connectivity](/help/troubleshooting-resource-connectivity) walks these checks in order, from the peer through to the TCP handshake at the resource. + - `netbird status -d` shows the connection type, status, and details for every peer the client is connected to. - `netbird networks ls` shows which networks the client is currently using. - `netbird debug trace` simulates a packet against the firewall rules without sending real traffic. Useful when policies look right but traffic is dropped. 
On Linux this is most informative when the kernel firewall backend is active. diff --git a/src/pages/manage/networks/index.mdx b/src/pages/manage/networks/index.mdx index 16c76e29..c6159270 100644 --- a/src/pages/manage/networks/index.mdx +++ b/src/pages/manage/networks/index.mdx @@ -98,6 +98,8 @@ Here is the whole path, end to end: One subtlety causes most policy mistakes. A network resource policy permits traffic *through* the routing peer to the resources behind it: the routing peer's **forward chain**. Reaching a service running *on* the routing peer itself (SSH, a dashboard) is the **input chain**, and that needs a separate peer-to-peer [access policy](/manage/access-control) with the routing peer's group as the destination. If users need both, create one policy of each kind. +If a resource stays unreachable even though the policy looks right, [Troubleshooting resource connectivity](/help/troubleshooting-resource-connectivity) helps you find which layer is dropping the traffic. + ``` THROUGH the routing peer → forward chain → network resource policy your client ──tunnel──► routing peer ──────► resource behind it diff --git a/src/pages/manage/networks/use-cases/reach-services-on-the-routing-peer.mdx b/src/pages/manage/networks/use-cases/reach-services-on-the-routing-peer.mdx index cff296aa..ae959369 100644 --- a/src/pages/manage/networks/use-cases/reach-services-on-the-routing-peer.mdx +++ b/src/pages/manage/networks/use-cases/reach-services-on-the-routing-peer.mdx @@ -37,6 +37,7 @@ With these in place, a mapped drive to the share works unchanged — reached at - Test reachability with `Test-NetConnection -Port ` (or `nc -z`), not `ping`. ICMP needs its own peer-to-peer policy; a service working while `ping` fails is the signature of a missing ICMP policy, not a broken route. - The service should now answer at the routing peer's LAN IP over the tunnel — for a file server, a mapped drive opens unchanged. +- Still unreachable? 
[Troubleshooting resource connectivity](/help/troubleshooting-resource-connectivity) isolates whether the problem is DNS, the route, the policy, forwarding on the peer, or the service itself. ## Fallback: resolve the name to the NetBird IP diff --git a/src/pages/manage/reverse-proxy/troubleshooting.mdx b/src/pages/manage/reverse-proxy/troubleshooting.mdx index f735efb5..188e1653 100644 --- a/src/pages/manage/reverse-proxy/troubleshooting.mdx +++ b/src/pages/manage/reverse-proxy/troubleshooting.mdx @@ -6,6 +6,8 @@ export const description = 'Troubleshoot common reverse proxy issues like 502 er This guide helps you diagnose and resolve common reverse proxy issues in NetBird. Follow the structured approach below to identify and fix problems quickly. +If a service behind a routing peer is unreachable in general (not specific to the reverse proxy), see [Troubleshooting resource connectivity](/help/troubleshooting-resource-connectivity) to find which layer is dropping the traffic. + ## Quick Diagnostics Checklist Before diving deep, run through this quick checklist: diff --git a/src/pages/selfhosted/external-reverse-proxy.mdx b/src/pages/selfhosted/external-reverse-proxy.mdx index 62f7798a..dd9191cb 100644 --- a/src/pages/selfhosted/external-reverse-proxy.mdx +++ b/src/pages/selfhosted/external-reverse-proxy.mdx @@ -68,7 +68,7 @@ After you answer the prompts, the script writes a `docker-compose.yml` with the The readiness check probes `https://your-domain/oauth2/.well-known/openid-configuration` through the proxy. The script auto-detects your Traefik container (any container running a Traefik image with ports 80 and 443 published) so it can pull diagnostic logs if something goes wrong. -If the wait check hangs, see [Installation Script Issues](/selfhosted/troubleshooting#installation-script-issues) for the common causes. +If the wait check hangs, see [Installation script issues](/selfhosted/troubleshooting/installation) for the common causes. 
### Existing Deployments diff --git a/src/pages/selfhosted/troubleshooting.mdx b/src/pages/selfhosted/troubleshooting.mdx index 94d18150..102b4cdd 100644 --- a/src/pages/selfhosted/troubleshooting.mdx +++ b/src/pages/selfhosted/troubleshooting.mdx @@ -1,255 +1,102 @@ -# Troubleshooting - -This page will help with various issues when self-hosting NetBird. - -## Embedded IdP Issues - -### Setup page not accessible - -**Problem**: You can't access the `/setup` page to create the first user. - -**Solutions**: -- The `/setup` page is only available when no users exist. If you've already created a user, use the regular login page. -- Check that the embedded IdP is enabled in your configuration. -- Verify the Management service is running: `docker compose logs management` - -### "Setup already completed" error (HTTP 412) - -**Problem**: The setup endpoint returns a 412 error. - -**Solution**: Setup has already been completed. The first user was already created. Use the regular login page to sign in. - -### Password not working after user creation - -**Problem**: You created a user but the password doesn't work. - -**Solutions**: -- Passwords are shown only once during user creation. If you didn't save it, you'll need to delete and recreate the user. -- Via API, you can create a new user with a new password. -- For the owner account, you may need to reset the database if no other admins exist. - -### SSO connector not appearing on login page - -**Problem**: You configured an identity provider but it doesn't show on the login page. - -**Solutions**: -1. Verify the connector was saved: Go to **Settings** → **Identity Providers** -2. Check that the redirect URL is correctly configured in your IdP -3. Review Management service logs for configuration errors: `docker compose logs management` -4. 
Ensure the IdP application has the correct redirect URI from NetBird - -### "Invalid redirect URI" error from IdP - -**Problem**: When clicking an SSO button, the IdP returns an invalid redirect URI error. - -**Solutions**: -1. Copy the exact redirect URL from NetBird (shown after saving the connector) -2. Add it to your IdP's allowed redirect URIs -3. Check for trailing slashes or typos -4. Some IdPs are case-sensitive - -### Identity Providers tab not visible - -**Problem**: You don't see the Identity Providers tab in Settings. - -**Solution**: This tab is only visible when the embedded IdP is enabled. Check your deployment configuration: -- For quickstart deployments, the embedded IdP should be enabled by default -- For the combined setup, the embedded IdP is always enabled -- For older multi-container deployments, ensure `EmbeddedIdP.Enabled` is set to `true` in `management.json` - -### Users not syncing from SSO provider - -**Problem**: Users who authenticate via SSO don't appear in the user list. - -**Solutions**: -- Users appear after their first successful login, not immediately after connector configuration -- Check that the SSO flow completes successfully (user should land on Dashboard) -- Review Management logs for any token validation errors - -## Installation Script Issues - -### Script hangs on "Waiting for NetBird server to become ready" - -**Problem**: The `getting-started.sh` script stays on the wait line for several minutes, even though netbird-server itself looks healthy in `docker ps`. - -The wait check probes the OIDC endpoint through your reverse proxy, so a healthy server alone isn't enough. Something on the path from the public internet to netbird-server is broken. Check these in order: - -**1. DNS** - -Confirm your domain points at the host: - -```bash -dig +short netbird.example.com -``` - -The result should be the public IP of the VM running NetBird. 
If it's empty or wrong, the cert challenge can't complete and the probe will never succeed. - -**2. Cert issuance** - -For options 0 (built-in Traefik) and 1 (existing Traefik), check that a real cert was issued: - -```bash -curl -vI https://netbird.example.com 2>&1 | grep -E "subject|issuer" -``` - -If you see a self-signed or default cert, ACME hasn't completed. Check Traefik logs for ACME errors. Common causes: port 80 not reachable from the internet, DNS not propagated yet, or Let's Encrypt rate limit hit. - -**3. Traefik can talk to the Docker socket (option 1 only)** - -Traefik discovers NetBird containers via Docker labels. If the Docker socket is mounted but Traefik can't read it (old SDK, wrong path, permission denied), routes never get created. Check Traefik logs for errors like `client version is too old` or `permission denied while trying to connect to the Docker daemon socket`. - -**4. Network membership matches (option 1 only)** - -Confirm the Traefik container and NetBird containers are actually on the same Docker network: - -```bash -docker network inspect | grep -A 2 Containers -``` - -You should see both Traefik and the NetBird containers listed. - -**5. Routing works end to end** +import { TroubleshootingStart } from "@/components/TroubleshootingStart" +import { TroubleshootingTiles } from "@/components/TroubleshootingTiles" +import { HashRedirect } from "@/components/HashRedirect" + +export const description = + "Troubleshoot a self-hosted NetBird deployment: collect diagnostics, then jump to the area that matches your issue." + + -If the four above look fine, test the full path manually: - -```bash -curl -k https://netbird.example.com/oauth2/.well-known/openid-configuration -``` - -A valid response is JSON containing `"issuer"`. 
Anything else (empty body, 404, 502, connection refused) tells you where to dig further: 502 means Traefik can't reach netbird-server, 404 means the labels aren't matched, connection refused means you're not hitting Traefik at all. - -You can let the script keep waiting while you debug. As soon as the probe succeeds, the script will continue. If you'd rather start over, kill it with Ctrl+C, run `docker compose down -v`, fix the issue, and re-run. - -### Script fails on existing installation check - -**Problem**: The script exits immediately with a message about generated files already existing. - -**Solution**: This is intentional and protects you from overwriting a working setup. To start fresh: - -```bash -docker compose down --volumes -rm -f docker-compose.yml dashboard.env config.yaml proxy.env \ - traefik-dynamic.yaml nginx-netbird.conf caddyfile-netbird.txt \ - npm-advanced-config.txt -``` - -Then re-run the script. Be aware this removes all NetBird data including users and peers. - -## Debugging TURN connections - -In the case that the peer-to-peer connection is not an option then the peer will use the TURN server for the secure connection establishment. If the connection is not possible even with TURN (Relay), -then we need to confirm that your turn configuration is correct and that it is available. - -To test your TURN configuration you can access the [online tester](https://webrtc.github.io/samples/src/content/peerconnection/trickle-ice). -There you will find a ICE servers input box, where you can select and remove the existing server, then add your turn server -configuration as follows: - -Please replace netbird.DOMAIN.com and PASSWORD with your STUN/TURN server details from your configuration (config.yaml for the combined setup, or the TURNConfig section in management.json for older multi-container setups), then click on Add server. - -

- turn -

- -You should see an output similar to the following: -

- turn -

-Where you have the following types: `host` (local address), `srflx` (STUN reflexive address), `relay` -(TURN relay address). If `srflx` and `relay` are not present then the TURN server is not working or not accessible and you should review the required ports in the [requirements section](/selfhosted/selfhosted-guide#requirements). - -## Dashboard Issues - -### Dashboard shows blank page - -**Problem**: The Dashboard loads but shows a blank page or errors. - -**Solutions**: -1. Check browser console for JavaScript errors (F12 → Console) -2. Verify the Dashboard can reach the Management API -3. Check CORS configuration if running behind a custom reverse proxy -4. Clear browser cache and try again - -### "Unauthorized" or "403" errors - -**Problem**: API calls return unauthorized or forbidden errors. - -**Solutions**: -1. Verify your authentication token is valid -2. Check that the user has appropriate permissions -3. For API access, ensure you're using a valid Personal Access Token (PAT) -4. Review Management service logs for detailed error information - -## Certificate Issues - -### Let's Encrypt certificate not renewing - -**Problem**: SSL certificate expires and doesn't auto-renew. - -**Solutions**: -1. Ensure port 80 is accessible for ACME challenge -2. Check Caddy logs: `docker compose logs caddy` -3. Verify the domain points to the correct IP -4. Manually trigger renewal: `docker exec -it netbird-caddy caddy reload` - -### Certificate errors with custom reverse proxy - -**Problem**: SSL errors when using your own reverse proxy. - -**Solutions**: -1. Ensure your reverse proxy terminates SSL correctly -2. Set `NETBIRD_DISABLE_LETSENCRYPT=true` in your configuration -3. Configure proper headers (X-Forwarded-For, X-Forwarded-Proto) -4. Verify HTTP/2 support is enabled for gRPC endpoints - -## Connection Issues - -### Peers can't connect to each other - -**Problem**: Peers are registered but can't establish connections. - -**Solutions**: -1. 
Check that UDP port 3478 is accessible (STUN/TURN) -2. Verify the TURN server is working (see [TURN debugging](#debugging-turn-connections)) -3. Check firewall rules on both peers -4. Review peer logs: `netbird status -d` - -### Management service unreachable - -**Problem**: Clients can't connect to the Management service. - -**Solutions**: -1. Verify port 443 is accessible -2. Check DNS resolution for your domain -3. Review Management logs: `docker compose logs management` -4. Test with curl: `curl -v https://your-domain.com/api/health` - -## Database Issues - -### Management service won't start after upgrade - -**Problem**: After upgrading, the Management service fails to start. - -**Solutions**: -1. Check logs for migration errors: `docker compose logs management` -2. Ensure you followed the [upgrade path](/selfhosted/selfhosted-quickstart#upgrade) for your version -3. Restore from backup if needed -4. For major version jumps, you may need intermediate upgrades - -### Data corruption after power loss - -**Problem**: Services don't start properly after unexpected shutdown. +# Troubleshooting -**Solutions**: -1. Check for database lock files -2. Review all service logs -3. Consider restoring from backup -4. For SQLite databases, you may need to run integrity checks +This page helps with issues when self-hosting NetBird. Collect the diagnostics below, then pick the area that matches your problem. + + + + ## Getting Help -If you're still experiencing issues: +If you're still experiencing issues, see [Report bugs and issues](/help/report-bug-issues) for the right channel: -1. **Check logs**: `docker compose logs` for all services -2. **Search existing issues**: [GitHub Issues](https://github.com/netbirdio/netbird/issues) -3. **Join our community**: [Slack Channel](/slack-url) -4. **Open an issue**: Include logs, configuration (without secrets), and steps to reproduce \ No newline at end of file +1. 
**Gather evidence first**: `docker compose logs` for all services, your configuration (without secrets), and the steps to reproduce. +2. **Open source self-hosted and general questions** go to [Community Support](/help/community-support): the [Slack Channel](/slack-url) for quick questions, or [GitHub Discussions](https://github.com/netbirdio/netbird/discussions) for a written record. +3. **Cloud customers and users, and commercial-license deployments** can reach the team through [NetBird Support](/help/netbird-support). diff --git a/src/pages/selfhosted/troubleshooting/certificates.mdx b/src/pages/selfhosted/troubleshooting/certificates.mdx new file mode 100644 index 00000000..c0cefd6f --- /dev/null +++ b/src/pages/selfhosted/troubleshooting/certificates.mdx @@ -0,0 +1,30 @@ +export const description = + "Self-hosted NetBird certificate troubleshooting: Let's Encrypt renewal and TLS errors behind a custom reverse proxy." + +# Certificate issues + +TLS and certificate problems on a self-hosted deployment. For other areas, start from [Troubleshooting](/selfhosted/troubleshooting). + +## Let's Encrypt certificate not renewing + +**Symptom**: The TLS certificate expires and does not auto-renew, so clients and browsers report an expired or invalid certificate. + +**Likely causes and fixes** (most common first): + +1. **Port 80 is not reachable from the internet.** The ACME HTTP challenge (how Let's Encrypt validates your domain) needs inbound TCP/80. Confirm your firewall and cloud security groups allow it. +2. **The domain no longer points at this host.** Verify the `A`/`AAAA` record resolves to the server's public IP. +3. **A renewal error in the proxy.** Check the certificate manager's logs: `docker compose logs caddy`. If needed, force a reload: `docker exec -it netbird-caddy caddy reload`. + +**Confirm**: `curl -vI https://YOUR_DOMAIN 2>&1 | grep -E "issuer|expire"` shows a current Let's Encrypt certificate. 
+ +## Certificate errors with custom reverse proxy + +**Symptom**: TLS errors when terminating TLS on your own reverse proxy instead of the bundled one. + +**Likely causes and fixes** (most common first): + +1. **Let's Encrypt is still enabled, so two components fight over TLS.** Set `NETBIRD_DISABLE_LETSENCRYPT=true` so NetBird stops managing certificates and leaves termination to your proxy. +2. **Forwarded headers are missing.** Set `X-Forwarded-For` and `X-Forwarded-Proto` on the proxy so NetBird sees the original scheme and client. +3. **gRPC fails without HTTP/2.** The Management gRPC endpoints need HTTP/2; enable it on the proxy. + +**Confirm**: The dashboard loads over your proxy without TLS warnings, and `netbird status` from a client shows `Management: Connected`. diff --git a/src/pages/selfhosted/troubleshooting/connectivity.mdx b/src/pages/selfhosted/troubleshooting/connectivity.mdx new file mode 100644 index 00000000..d471b3ae --- /dev/null +++ b/src/pages/selfhosted/troubleshooting/connectivity.mdx @@ -0,0 +1,52 @@ +export const description = + "Self-hosted NetBird connectivity troubleshooting: testing the TURN server, peer-to-peer connection failures, and an unreachable Management service." + +# Connectivity issues + +Peer connectivity and relay problems on a self-hosted deployment. For other areas, start from [Troubleshooting](/selfhosted/troubleshooting). + +## Debugging TURN connections + +When peers can't establish a direct connection, they fall back to the TURN (relay) server. If the connection still fails with relay, confirm your TURN configuration is correct and reachable. + +To test it, open the [Trickle ICE test tool](https://webrtc.github.io/samples/src/content/peerconnection/trickle-ice). In the **ICE servers** box, remove the default server and add your TURN server. 
Replace `netbird.DOMAIN.com` and `PASSWORD` with the STUN/TURN server details from your configuration (`config.yaml` for the combined setup, or the `TURNConfig` section in `management.json` for older multi-container setups), then click **Add server**. + +

+ The Trickle ICE test tool with a NetBird STUN/TURN server added to the ICE servers list +

+ +*Add your STUN/TURN server in the ICE servers box, then click Add server.* + +Gather candidates and read the result: + +

+ Trickle ICE test output listing host, srflx, and relay candidate types +

+ +*A working TURN server returns `srflx` and `relay` candidates.* + +The candidate types are `host` (local address), `srflx` (STUN reflexive address), and `relay` (TURN relay address). If `srflx` and `relay` are missing, the TURN server is not working or not reachable. Review the required ports in the [requirements section](/selfhosted/selfhosted-guide#requirements). + +## Peers can't connect to each other + +**Symptom**: Peers are registered and show in the dashboard, but can't establish a connection between them. + +**Likely causes and fixes** (most common first): + +1. **STUN/TURN is unreachable.** Confirm UDP/3478 (STUN/TURN) is open, then verify the relay path with the [TURN test above](#debugging-turn-connections). +2. **A host firewall is dropping traffic** on one or both peers. Check the local firewall rules on each peer. +3. **The peer itself is unhealthy.** Run `netbird status -d` on both peers and check the connection type and last handshake. + +**Confirm**: `netbird status -d` on both peers shows `Status: Connected` and a recent WireGuard handshake. + +## Management service unreachable + +**Symptom**: Clients can't register or stay connected, reporting that the Management service is unreachable. + +**Likely causes and fixes** (most common first): + +1. **TCP/443 is blocked** between the client and the server. Confirm the port is open end to end. +2. **DNS doesn't resolve your domain** to the server's public IP. Check resolution from the client. +3. **Management is down or erroring.** Review `docker compose logs management`, and test the endpoint directly: `curl -v https://YOUR_DOMAIN/api/health`. + +**Confirm**: `curl https://YOUR_DOMAIN/api/health` returns a healthy response, and `netbird status` shows `Management: Connected`. 
diff --git a/src/pages/selfhosted/troubleshooting/dashboard.mdx b/src/pages/selfhosted/troubleshooting/dashboard.mdx new file mode 100644 index 00000000..742cc144 --- /dev/null +++ b/src/pages/selfhosted/troubleshooting/dashboard.mdx @@ -0,0 +1,30 @@ +export const description = + "Self-hosted NetBird dashboard troubleshooting: blank pages and unauthorized or 403 errors." + +# Dashboard issues + +Problems with the self-hosted dashboard. For other areas, start from [Troubleshooting](/selfhosted/troubleshooting). + +## Dashboard shows blank page + +**Symptom**: The dashboard loads but renders a blank page, sometimes with errors in the browser console. + +**Likely causes and fixes** (most common first): + +1. **The dashboard can't reach the Management API.** Open the browser console (F12 → Console) and look for failed requests. Confirm the dashboard's configured API URL is correct and reachable from the browser. +2. **CORS (Cross-Origin Resource Sharing) is blocking the API behind a custom reverse proxy.** Serve the API on the same origin as the dashboard, or set the correct CORS headers on your proxy. +3. **A stale cached bundle.** Hard-reload the page or clear the browser cache. + +**Confirm**: Reload the dashboard. The login or peers view renders, and the console shows no failed API requests. + +## "Unauthorized" or "403" errors + +**Symptom**: API calls return 401 Unauthorized or 403 Forbidden. + +**Likely causes and fixes** (most common first): + +1. **An expired or invalid token.** Re-authenticate. For direct API access, use a valid Personal Access Token (PAT). +2. **The user lacks permission** for the action. Check the user's role in the dashboard. +3. **Management can't validate the token.** Review `docker compose logs management` for token-validation errors. + +**Confirm**: Re-run the action. It succeeds, and the Management logs show no auth errors. 
diff --git a/src/pages/selfhosted/troubleshooting/database.mdx b/src/pages/selfhosted/troubleshooting/database.mdx new file mode 100644 index 00000000..bc223b22 --- /dev/null +++ b/src/pages/selfhosted/troubleshooting/database.mdx @@ -0,0 +1,30 @@ +export const description = + "Self-hosted NetBird database troubleshooting: the Management service failing to start after an upgrade, and recovery after power loss." + +# Database issues + +Database and migration problems on a self-hosted deployment. For other areas, start from [Troubleshooting](/selfhosted/troubleshooting). + +## Management service won't start after upgrade + +**Symptom**: After an upgrade, the Management container fails to start or crash-loops. + +**Likely causes and fixes** (most common first): + +1. **A failed schema migration.** Check `docker compose logs management` for migration errors. This is the usual cause and the logs name the failing step. +2. **A skipped intermediate version.** Major version jumps may require stepping through intermediate releases. Follow the [upgrade path](/selfhosted/maintenance/upgrade) for your version. +3. **A corrupt or partially-migrated database.** If the migration can't be completed, restore from a pre-upgrade backup and retry the upgrade. + +**Confirm**: `docker compose ps` shows `management` as `Up`, and its logs end with the service listening rather than a migration error. + +## Data corruption after power loss + +**Symptom**: Services don't start cleanly after an unexpected shutdown. + +**Likely causes and fixes** (most common first): + +1. **A stale lock file.** Check for and remove a leftover database lock if the process that held it is gone. +2. **An interrupted write.** Review all service logs to find which store is failing. For SQLite, run an integrity check on the database file. +3. **An unrecoverable file.** If the store can't be repaired, restore from the most recent backup. 
+ +**Confirm**: All services report `Up` in `docker compose ps`, and peers reconnect with `netbird status` showing `Management: Connected`. diff --git a/src/pages/selfhosted/troubleshooting/identity-provider.mdx b/src/pages/selfhosted/troubleshooting/identity-provider.mdx new file mode 100644 index 00000000..efe266e7 --- /dev/null +++ b/src/pages/selfhosted/troubleshooting/identity-provider.mdx @@ -0,0 +1,80 @@ +export const description = + "Self-hosted NetBird embedded IdP and SSO troubleshooting: setup page access, redirect URIs, connectors, and user sync." + +# Embedded IdP issues + +Problems with the embedded identity provider (IdP) and SSO (single sign-on) on a self-hosted deployment. For other areas, start from [Troubleshooting](/selfhosted/troubleshooting). + +## Setup page not accessible + +**Symptom**: You can't open the `/setup` page to create the first user. + +**Likely causes and fixes** (most common first): + +1. **A user already exists.** `/setup` is only available when no users exist. If you've already created one, use the regular login page. +2. **The embedded IdP is disabled.** Confirm it is enabled in your configuration. +3. **Management isn't running.** Check `docker compose logs management`. + +**Confirm**: `/setup` loads (first run), or the regular login page loads (setup already done). + +## "Setup already completed" error (HTTP 412) + +**Symptom**: The setup endpoint returns a 412 error. + +**Cause**: Setup has already completed; the first user exists. + +**Fix**: Use the regular login page to sign in. + +## Password not working after user creation + +**Symptom**: You created a user but the password doesn't work. + +**Cause**: Passwords are shown only once at creation. If it wasn't saved, it can't be recovered. + +**Fix**: Recreate the user (delete and re-add, or create a new one with a new password via the API). For the owner account with no other admins, you may need to reset the database. 
+ +**Confirm**: You can log in with the new credentials. + +## SSO connector not appearing on login page + +**Symptom**: You configured an identity provider but it doesn't show on the login page. + +**Likely causes and fixes** (most common first): + +1. **The connector wasn't saved.** Check **Settings → Identity Providers**. +2. **The redirect URI is wrong.** Ensure the IdP application has the exact redirect URI shown by NetBird. +3. **A configuration error.** Review `docker compose logs management` for connector errors. + +**Confirm**: The SSO button appears on the login page and starts the IdP flow. + +## "Invalid redirect URI" error from IdP + +**Symptom**: Clicking an SSO button returns an invalid redirect URI error from the IdP. + +**Cause**: The redirect URI registered at the IdP doesn't exactly match the one NetBird sends. + +**Fix**: Copy the exact redirect URL from NetBird (shown after saving the connector) into your IdP's allowed redirect URIs. Watch for trailing slashes, typos, and case sensitivity. + +**Confirm**: The SSO flow completes and lands you on the dashboard. + +## Identity Providers tab not visible + +**Symptom**: The Identity Providers tab is missing from Settings. + +**Cause**: The tab only appears when the embedded IdP is enabled. + +**Fix**: Check your deployment: quickstart enables it by default, the combined setup always has it on, and older multi-container deployments need `EmbeddedIdP.Enabled` set to `true` in `management.json`. + +**Confirm**: The Identity Providers tab appears under Settings. + +## Users not syncing from SSO provider + +**Symptom**: Users who authenticate via SSO don't appear in the user list. + +**Likely causes and fixes** (most common first): + +1. **They haven't logged in yet.** Users appear after their first successful login, not when the connector is configured. +2. **The SSO flow isn't completing.** Confirm the user lands on the dashboard after authenticating. +3. 
**Token validation is failing.** Review the Management logs for token errors. + +**Confirm**: After a successful SSO login, the user appears in the dashboard user list. diff --git a/src/pages/selfhosted/troubleshooting/installation.mdx b/src/pages/selfhosted/troubleshooting/installation.mdx new file mode 100644 index 00000000..96176560 --- /dev/null +++ b/src/pages/selfhosted/troubleshooting/installation.mdx @@ -0,0 +1,71 @@ +export const description = + "Self-hosted NetBird installation script issues: the readiness wait, DNS, certificates, Traefik, and re-running cleanly." + +# Installation script issues + +Problems running the self-hosted `getting-started.sh` script. For other areas, start from [Troubleshooting](/selfhosted/troubleshooting). + +## Script hangs on "Waiting for NetBird server to become ready" + +**Symptom**: The `getting-started.sh` script stays on the wait line for several minutes, even though netbird-server looks healthy in `docker ps`. + +The wait check probes the OIDC endpoint through your reverse proxy, so a healthy server alone isn't enough. Something on the path from the public internet to netbird-server is broken. Check these likely causes in order, most common first: + +**1. DNS doesn't point at the host** + +```bash +dig +short netbird.example.com +``` + +The result should be the public IP of the VM running NetBird. If it's empty or wrong, the cert challenge can't complete and the probe never succeeds. + +**2. The certificate didn't issue** + +For options 0 (built-in Traefik) and 1 (existing Traefik), check that a real cert was issued: + +```bash +curl -vI https://netbird.example.com 2>&1 | grep -E "subject|issuer" +``` + +A self-signed or default cert means ACME hasn't completed. Check Traefik logs for ACME errors. Common causes: port 80 not reachable from the internet, DNS not propagated yet, or a Let's Encrypt rate limit. + +**3. 
Traefik can't read the Docker socket (option 1 only)** + +Traefik discovers NetBird containers via Docker labels. If the socket is mounted but unreadable (old SDK, wrong path, permission denied), routes never get created. Check Traefik logs for `client version is too old` or `permission denied while trying to connect to the Docker daemon socket`. + +**4. Traefik and NetBird aren't on the same network (option 1 only)** + +```bash +docker network inspect <network-name> | grep -A 2 Containers +``` + +You should see both Traefik and the NetBird containers listed. + +**5. Routing fails end to end** + +If the four checks above look fine, test the full path manually: + +```bash +curl -k https://netbird.example.com/oauth2/.well-known/openid-configuration +``` + +A valid response is JSON containing `"issuer"`. Anything else points to where to dig: 502 means Traefik can't reach netbird-server, 404 means the labels aren't matched, connection refused means you're not hitting Traefik at all. + +**Confirm**: The probe succeeds and the script continues on its own. You can leave it waiting while you debug. To start over instead, stop it with Ctrl+C, run `docker compose down -v`, fix the issue, and re-run. + +## Script fails on existing installation check + +**Symptom**: The script exits immediately with a message about generated files already existing. + +**Cause**: This is intentional, and protects a working setup from being overwritten. + +**Fix**: To start fresh (this removes all NetBird data, including users and peers): + +```bash +docker compose down --volumes +rm -f docker-compose.yml dashboard.env config.yaml proxy.env \ + traefik-dynamic.yaml nginx-netbird.conf caddyfile-netbird.txt \ + npm-advanced-config.txt +``` + +**Confirm**: Re-run the script; it proceeds past the existing-installation check and starts provisioning.