diff --git a/shard.lock b/shard.lock index 3a202455..1d1d0c8c 100644 --- a/shard.lock +++ b/shard.lock @@ -143,7 +143,7 @@ shards: neuroplastic: git: https://github.com/spider-gazelle/neuroplastic.git - version: 1.13.1 + version: 1.14.2 office365: git: https://github.com/placeos/office365.git @@ -179,7 +179,7 @@ shards: placeos-driver: git: https://github.com/placeos/driver.git - version: 7.19.0 + version: 7.21.1 placeos-log-backend: git: https://github.com/place-labs/log-backend.git @@ -187,7 +187,7 @@ shards: placeos-models: git: https://github.com/placeos/models.git - version: 9.85.0 + version: 9.86.2 placeos-resource: git: https://github.com/place-labs/resource.git @@ -207,11 +207,11 @@ shards: redis-cluster: git: https://github.com/place-labs/redis-cluster.cr.git - version: 0.8.10 + version: 0.8.11 redis_service_manager: git: https://github.com/place-labs/redis_service_manager.git - version: 3.3.0 + version: 3.3.1 rendezvous-hash: git: https://github.com/caspiano/rendezvous-hash.git diff --git a/spec/api/chaos_spec.cr b/spec/api/chaos_spec.cr index b9e3b97b..46d2a69e 100644 --- a/spec/api/chaos_spec.cr +++ b/spec/api/chaos_spec.cr @@ -1,12 +1,43 @@ require "../helper" -# Testing pattern for controllers as follows.. -# - Create the controller instance, with intended mocks -# - Create the call instance -# - Check the outcome of the request - module PlaceOS::Core describe Api::Chaos, tags: "api" do - pending "chaos/terminate" + client = AC::SpecHelper.client + namespace = Api::Chaos::NAMESPACE[0] + + after_each do + Services.reset + end + + it "chaos/terminate" do + ProcessManager.with_driver do |mod, _driver_path, driver_key, _driver| + module_manager = module_manager_mock + Services.module_manager = module_manager + + module_manager.load_module(mod) + pid = module_manager.local_processes.protocol_manager_by_driver?(driver_key).try(&.pid).not_nil! + Process.exists?(pid).should be_true + + response = client.post("#{namespace}terminate?path=#{driver_key}") + response.status_code.should eq 200 + + success = Channel(Nil).new + spawn do + while Process.exists?(pid) + sleep 50.milliseconds + end + success.send nil + end + + select + when success.receive + Process.exists?(pid).should be_false + when timeout 2.seconds + raise "timeout waiting for driver terminate" + end + ensure + module_manager.try &.stop + end + end end end diff --git a/spec/api/command_spec.cr b/spec/api/command_spec.cr index 64a98a58..0bae5367 100644 --- a/spec/api/command_spec.cr +++ b/spec/api/command_spec.cr @@ -11,12 +11,6 @@ module PlaceOS::Core::Api used_for_place_testing: [] of String, }.to_json - # allow injecting mock manager during testing - class Command - class_property mock_module_manager : ModuleManager? = nil - property module_manager : ModuleManager { @@mock_module_manager || ModuleManager.instance } - end - describe Command, tags: "api" do client = AC::SpecHelper.client @@ -25,7 +19,7 @@ module PlaceOS::Core::Api "Content-Type" => "application/json", } - after_each { Command.mock_module_manager = nil } + after_each { Services.reset } describe "command/:module_id/execute" do it "executes a command on a running module" do @@ -33,7 +27,8 @@ module PlaceOS::Core::Api mod_id = mod.id.as(String) module_manager = module_manager_mock module_manager.load_module(mod) - Command.mock_module_manager = module_manager + Services.module_manager = module_manager + Services.resource_manager = resource_manager route = File.join(namespace, mod_id, "execute") response = client.post(route, headers: json_headers, body: EXEC_PAYLOAD) @@ -57,7 +52,8 @@ module PlaceOS::Core::Api module_manager = module_manager_mock # Register as lazy (don't spawn driver) module_manager.load_module(mod) - Command.mock_module_manager = module_manager + Services.module_manager = module_manager + Services.resource_manager = resource_manager # Verify driver is not spawned module_manager.local_processes.module_loaded?(mod_id).should be_false @@ -80,7 +76,7 @@ module PlaceOS::Core::Api # Don't load the module, but it's not lazy either module_manager = module_manager_mock - Command.mock_module_manager = module_manager + Services.module_manager = module_manager route = File.join(namespace, mod_id, "execute") response = client.post(route, headers: json_headers, body: EXEC_PAYLOAD) @@ -100,7 +96,8 @@ module PlaceOS::Core::Api # Load module module_manager.load_module(mod) - Command.mock_module_manager = module_manager + Services.module_manager = module_manager + Services.resource_manager = resource_manager # Create Command controller context route = File.join(namespace, mod_id, "debugger") @@ -117,7 +114,7 @@ module PlaceOS::Core::Api message_channel.close raise e end - Fiber.yield + sleep 100.milliseconds # Create an execute request route = File.join(namespace, mod_id, "execute") @@ -126,12 +123,14 @@ module PlaceOS::Core::Api # Wait for messages on the debugger messages = [] of String - 2.times do + deadline = Time.instant + 5.seconds + until Time.instant >= deadline select when message = message_channel.receive messages << message + break if message == %([1,"this will be propagated to backoffice!"]) when timeout 2.seconds - break + next end end @@ -140,7 +139,5 @@ module PlaceOS::Core::Api resource_manager.try &.stop end end - - pending "command/debugger" end end diff --git a/spec/api/edge_spec.cr b/spec/api/edge_spec.cr new file mode 100644 index 00000000..be9d1a7d --- /dev/null +++ b/spec/api/edge_spec.cr @@ -0,0 +1,86 @@ +require "../helper" + +module PlaceOS::Core::Api + describe Edge, tags: "api" do + client = AC::SpecHelper.client + + namespace = Edge::NAMESPACE[0] + json_headers = HTTP::Headers{ + "Content-Type" => "application/json", + } + + it "returns desired state snapshots for an edge" do + _, _, mod = setup(role: PlaceOS::Model::Driver::Role::Service) + resource_manager = PlaceOS::Core::ResourceManager.new(testing: true) + resource_manager.start { } + edge = PlaceOS::Model::Generator.edge.save! + mod.edge_id = edge.id.as(String) + mod.running = true + mod.save! + + route = File.join(namespace, edge.id.as(String), "desired_state") + response = client.get(route, headers: json_headers) + response.status_code.should eq 200 + + snapshot = PlaceOS::Edge::State::Snapshot.from_json(response.body) + snapshot.edge_id.should eq edge.id + snapshot.modules.map(&.module_id).should contain(mod.id.as(String)) + snapshot.drivers.should_not be_empty + ensure + resource_manager.try &.stop + end + + it "returns not modified when the desired state is stale" do + _, _, mod = setup(role: PlaceOS::Model::Driver::Role::Service) + resource_manager = PlaceOS::Core::ResourceManager.new(testing: true) + resource_manager.start { } + edge = PlaceOS::Model::Generator.edge.save! + mod.edge_id = edge.id.as(String) + mod.running = true + mod.save! + + route = File.join(namespace, edge.id.as(String), "desired_state") + first = client.get(route, headers: json_headers) + first.status_code.should eq 200 + snapshot = PlaceOS::Edge::State::Snapshot.from_json(first.body) + + headers = json_headers.dup + headers["If-Modified-Since"] = HTTP.format_time(snapshot.last_modified) + second = client.get(route, headers: headers) + second.status_code.should eq 304 + ensure + resource_manager.try &.stop + end + + it "returns not found for an unknown edge snapshot request" do + response = client.get(File.join(namespace, "edge-missing", "desired_state"), headers: json_headers) + response.status_code.should eq 404 + end + + it "streams compiled driver binaries for an edge" do + _, driver, mod = setup(role: PlaceOS::Model::Driver::Role::Service) + resource_manager = PlaceOS::Core::ResourceManager.new(testing: true) + resource_manager.start { } + edge = PlaceOS::Model::Generator.edge.save! + mod.edge_id = edge.id.as(String) + mod.running = true + mod.save! + + result = PlaceOS::Core::DriverResource.load(driver, PlaceOS::Core::DriverStore.new, true) + route = File.join(namespace, edge.id.as(String), "drivers", File.basename(result.path)) + response = client.get(route, headers: json_headers) + response.status_code.should eq 200 + response.headers["Content-Type"].should eq "application/octet-stream" + response.body.bytesize.should be > 0 + ensure + resource_manager.try &.stop + end + + it "returns not found when the binary key does not exist" do + edge = PlaceOS::Model::Generator.edge.save! + route = File.join(namespace, edge.id.as(String), "drivers", "drivers_missing_deadbeef_arm64") + response = client.get(route, headers: json_headers) + response.status_code.should eq 404 + end + end +end diff --git a/spec/api/status_spec.cr b/spec/api/status_spec.cr index 95d1dadc..a87af8a1 100644 --- a/spec/api/status_spec.cr +++ b/spec/api/status_spec.cr @@ -9,9 +9,14 @@ module PlaceOS::Core::Api "Content-Type" => "application/json", } + after_each do + Services.reset + end + describe "status/" do it "renders data about node" do _, driver, _, resource_manager = create_resources + Services.resource_manager = resource_manager driver.reload! @@ -27,10 +32,81 @@ module PlaceOS::Core::Api resource_manager.try &.stop end - pending "deletes standalone driver binary used for metadata" - end + it "returns local driver status for a running module" do + _, driver, mod, resource_manager = create_resources + module_manager = module_manager_mock + Services.module_manager = module_manager + Services.resource_manager = resource_manager + module_manager.load_module(mod) + + driver_path = module_manager.store.driver_binary_path(driver.file_name, driver.commit).to_s + route = "#{namespace}driver?path=#{URI.encode_path(driver_path)}" + response = client.get(route, headers: json_headers) + response.status_code.should eq 200 + + status = Status::DriverStatus.from_json(response.body) + status.local.should_not be_nil + status.local.not_nil!.running.should be_true + status.edge.should be_empty + ensure + module_manager.try &.stop + resource_manager.try &.stop + end + + it "returns machine load for local and edge runtimes" do + _, _, _, resource_manager = create_resources + Services.resource_manager = resource_manager + response = client.get("#{namespace}load", headers: json_headers) + response.status_code.should eq 200 + + load = Status::MachineLoad.from_json(response.body) + load.local.hostname.should_not be_empty + load.local.cpu_count.should be > 0 + load.edge.should be_empty + ensure + resource_manager.try &.stop + end + + it "returns loaded module mappings" do + _, _, mod, resource_manager = create_resources + module_manager = module_manager_mock + Services.module_manager = module_manager + Services.resource_manager = resource_manager + module_manager.load_module(mod) + + response = client.get("#{namespace}loaded", headers: json_headers) + response.status_code.should eq 200 + + loaded = Status::LoadedModules.from_json(response.body) + loaded.local.values.flatten.should contain(mod.id.as(String)) + loaded.edge.should be_empty + ensure + module_manager.try &.stop + resource_manager.try &.stop + end + + it "reports persisted edge connection visibility" do + edge = PlaceOS::Model::Generator.edge.save! + edge.update_fields( + online: true, + last_seen: Time.utc + ) + + module_manager = module_manager_mock + Services.module_manager = module_manager + response = client.get("#{namespace}edges", headers: json_headers) + response.status_code.should eq 200 - pending "status/driver" - pending "status/load" + body = Hash(String, Status::EdgeConnection).from_json(response.body) + body[edge.id.as(String)].online.should be_true + body[edge.id.as(String)].last_seen.should_not be_nil + body[edge.id.as(String)].websocket_connected.should be_false + body[edge.id.as(String)].snapshot_version.should be_nil + body[edge.id.as(String)].pending_updates.should eq 0 + body[edge.id.as(String)].pending_events.should eq 0 + ensure + module_manager.try &.stop + end + end end end diff --git a/spec/driver_manager/driver_cleanup_spec.cr b/spec/driver_manager/driver_cleanup_spec.cr index c2ccb768..a9501d25 100644 --- a/spec/driver_manager/driver_cleanup_spec.cr +++ b/spec/driver_manager/driver_cleanup_spec.cr @@ -24,9 +24,22 @@ module PlaceOS::Core tracker = DriverCleanup::StaleProcessTracker.new(DriverStore::BINARY_PATH, REDIS_CLIENT) stale_list = tracker.update_and_find_stale(ENV["STALE_THRESHOLD_DAYS"]?.try &.to_i || 30) stale_list.size.should eq(0) - driver_file = Path[DriverStore::BINARY_PATH, "drivers_place_private_helper_cce023a_#{Core::ARCH}"].to_s - value = REDIS_CLIENT.hgetall(driver_file) + value = case data = REDIS_CLIENT.hgetall(driver_path) + in Hash + data.transform_keys(&.to_s).transform_values(&.to_s) + in Array + hash = {} of String => String + data.each_slice(2) do |slice| + next unless field = slice[0]? + next unless raw = slice[1]? + hash[field.to_s] = raw.to_s + end + hash + end value["last_executed_at"].to_i64.should be > 0 + ensure + module_manager.try &.stop + resource_manager.try &.stop end end end diff --git a/spec/helper.cr b/spec/helper.cr index 5b4b43dc..5519408b 100644 --- a/spec/helper.cr +++ b/spec/helper.cr @@ -13,8 +13,9 @@ require "placeos-models/spec/generator" require "spec" -SPEC_DRIVER = "drivers/place/private_helper.cr" -CORE_URL = ENV["CORE_URL"]? || "http://core:3000" +SPEC_DRIVER = "drivers/place/private_helper.cr" +SPEC_DRIVER_COMMIT = ENV["SPEC_DRIVER_COMMIT"]? || "d2b97745373084441f6035deca6301a633f9edaf" +CORE_URL = ENV["CORE_URL"]? || "http://core:3000" # To reduce the run-time of the very setup heavy specs. # - Use teardown if you need to clear a temporary repository @@ -27,6 +28,24 @@ def random_id end def clear_tables + begin + PlaceOS::Core::Services.current_resource_manager?.try &.stop + PlaceOS::Core::ResourceManager.current_instance?.try &.stop + rescue + end + + begin + if module_manager = PlaceOS::Core::Services.current_module_manager? || PlaceOS::Core::ModuleManager.current_instance? + module_manager.local_processes.shutdown + module_manager.stop + end + rescue + end + + PlaceOS::Core::ResourceManager.reset_instance + PlaceOS::Core::ModuleManager.reset_instance + PlaceOS::Core::Services.reset + PlaceOS::Model::ControlSystem.clear PlaceOS::Model::Repository.clear PlaceOS::Model::Driver.clear @@ -69,10 +88,14 @@ Spec.before_suite do end Spec.after_suite do - PlaceOS::Core::ResourceManager.instance.stop + PlaceOS::Core::Services.current_resource_manager?.try &.stop + PlaceOS::Core::ResourceManager.current_instance?.try &.stop + PlaceOS::Core::Services.current_module_manager?.try &.stop + PlaceOS::Core::ModuleManager.current_instance?.try &.stop + PlaceOS::Core::ResourceManager.reset_instance + PlaceOS::Core::ModuleManager.reset_instance + PlaceOS::Core::Services.reset Log.builder.bind("*", backend: PlaceOS::LogBackend.log_backend, level: :error) - puts "\n> Terminating stray driver processes" - `pkill -f ".*core-spec.*"` rescue nil end # Create models for a test @@ -87,11 +110,7 @@ def setup(role : PlaceOS::Model::Driver::Role? = nil, use_head : Bool = false) driver_file_name = "drivers/place/private_helper.cr" driver_module_name = "PrivateHelper" driver_name = "spec_helper" - driver_commit = if use_head - "HEAD" - else - GitRepository.new(repository_uri).commits("master", depth: 1).first.hash - end + driver_commit = use_head ? "HEAD" : SPEC_DRIVER_COMMIT driver_role = role || PlaceOS::Model::Driver::Role::Logic existing_repo = PlaceOS::Model::Repository.where(uri: repository_uri).first? @@ -103,6 +122,14 @@ def setup(role : PlaceOS::Model::Driver::Role? = nil, use_head : Bool = false) if existing_repo && existing_driver && existing_module && !needs_control_system && right_driver_role repository, driver, mod = existing_repo, existing_driver, existing_module + # Tests reuse the same models heavily, so normalize mutable runtime-ish + # state that other specs may have changed. + if mod.edge_id.presence || mod.launch_on_execute || !mod.running + mod.edge_id = nil + mod.launch_on_execute = false + mod.running = true + mod.save! + end else clear_tables @@ -176,23 +203,19 @@ class MockClustering < Clustering ready_cb = Proc(Nil).new do cluster_stable_callbacks.each do |callback| - spawn do - begin - callback.call - rescue error - Log.error(exception: error) { "notifying cluster stable" } - end + begin + callback.call + rescue error + Log.error(exception: error) { "notifying cluster stable" } end end end rebalance_callbacks.each do |callback| - spawn do - begin - callback.call(rendezvous_hash, ready_cb) - rescue error - Log.error(exception: error) { "performing rebalance callback" } - end + begin + callback.call(rendezvous_hash, ready_cb) + rescue error + Log.error(exception: error) { "performing rebalance callback" } end end true diff --git a/spec/mappings/control_system_modules_spec.cr b/spec/mappings/control_system_modules_spec.cr index 15845d44..12cbef86 100644 --- a/spec/mappings/control_system_modules_spec.cr +++ b/spec/mappings/control_system_modules_spec.cr @@ -19,6 +19,14 @@ module PlaceOS::Core::Mappings end describe ControlSystemModules, tags: "mappings" do + Spec.before_each do + ::Log.builder.bind("place_os.resource", backend: PlaceOS::LogBackend.log_backend, level: :fatal) + end + + Spec.after_each do + ::Log.builder.bind("place_os.resource", backend: PlaceOS::LogBackend.log_backend, level: :error) + end + describe ".update_mapping" do it "ignores systems not mapped to node" do control_system = Model::Generator.control_system diff --git a/spec/placeos-edge/client_spec.cr b/spec/placeos-edge/client_spec.cr index 62b14b80..d3a25d36 100644 --- a/spec/placeos-edge/client_spec.cr +++ b/spec/placeos-edge/client_spec.cr @@ -1,51 +1,311 @@ require "./helper" +require "../processes/support" +require "file_utils" module PlaceOS::Edge + class Client + def __test_load_persisted_snapshot + load_persisted_snapshot + end + + def __test_connect_sync_count + @connect_sync_count.get + end + end + describe Client, tags: ["api", "edge"] do - it "handshakes on register" do - coordination = Channel(Bool).new + it "responds to execute requests over the realtime websocket without a register handshake" do + PlaceOS::Core::ProcessManager.with_driver do |mod, driver_path, driver_key, _driver| + client = Client.new(skip_handshake: true, ping: false) + client_ws, server_ws = mock_sockets + begin + module_id = mod.id.as(String) + client.runtime_manager.load(module_id, driver_key) + client.runtime_manager.start(module_id, PlaceOS::Core::ModuleManager.start_payload(mod)) - client = Client.new - client_ws, server_ws = mock_sockets + response_channel = Channel(Protocol::Text).new + + server_ws.on_message do |message| + parsed = Protocol::Text.from_json(message) + case parsed.body + when Protocol::Message::ProxyRedis, Protocol::Message::RuntimeEvent, Protocol::Message::Heartbeat + server_ws.send(Protocol::Text.new(parsed.sequence_id, Protocol::Message::Success.new(true)).to_json) + else + response_channel.send parsed + end + end + + spawn do + client.connect(client_ws) + rescue IO::Error | Channel::ClosedError + nil + end + spawn do + server_ws.run + rescue IO::Error | Channel::ClosedError + nil + end + Fiber.yield + + request = Protocol::Text.new( + 42_u64, + Protocol::Message::Execute.new( + module_id: module_id, + payload: PlaceOS::Core::ModuleManager.execute_payload(:used_for_place_testing), + user_id: nil + ) + ) + + server_ws.send(request.to_json) + + deadline = Time.instant + 2.seconds + loop do + raise "timed out waiting for execute response" if Time.instant >= deadline - spawn { - client.connect(client_ws) do - coordination.send(true) + select + when response = response_channel.receive + next unless response.sequence_id == 42_u64 + response.body.should be_a(Protocol::Message::ExecuteResponse) + + body = response.body.as(Protocol::Message::ExecuteResponse) + body.success.should be_true + body.output.should eq %("you can delete this file") + body.code.should eq 200 + break + when timeout 50.milliseconds + end + end + ensure + client.runtime_manager.kill(driver_key) rescue nil + client.disconnect + client_ws.close rescue nil + server_ws.close rescue nil end - } + end + end + + it "forwards debug messages and stops after ignore over the realtime websocket" do + PlaceOS::Core::ProcessManager.with_driver do |mod, _driver_path, driver_key, _driver| + client = Client.new(skip_handshake: true, ping: false) + client_ws, server_ws = mock_sockets + begin + module_id = mod.id.as(String) + client.runtime_manager.load(module_id, driver_key) + client.runtime_manager.start(module_id, PlaceOS::Core::ModuleManager.start_payload(mod)) + + received = Channel(Protocol::Text).new + + server_ws.on_message do |message| + parsed = Protocol::Text.from_json(message) + + body = parsed.body + case body + when Protocol::Message::Debug + server_ws.send(Protocol::Text.new(parsed.sequence_id, Protocol::Message::Success.new(true)).to_json) + received.send(parsed) + when Protocol::Message::Ignore + server_ws.send(Protocol::Text.new(parsed.sequence_id, Protocol::Message::Success.new(true)).to_json) + received.send(parsed) + when Protocol::Message::ProxyRedis, Protocol::Message::RuntimeEvent, Protocol::Message::Heartbeat + server_ws.send(Protocol::Text.new(parsed.sequence_id, Protocol::Message::Success.new(true)).to_json) + else + received.send(parsed) + end + end + + spawn do + client.connect(client_ws) + rescue IO::Error | Channel::ClosedError + nil + end + spawn do + server_ws.run + rescue IO::Error | Channel::ClosedError + nil + end + Fiber.yield + + server_ws.send(Protocol::Text.new(1_u64, Protocol::Message::Debug.new(module_id)).to_json) + + select + when response = received.receive + response.body.should be_a(Protocol::Message::Success) + response.sequence_id.should eq 1_u64 + when timeout 2.seconds + raise "timed out waiting for debug subscription" + end - messages = Channel(Protocol::Text).new + # Drain initial status chatter before triggering the explicit debug output. + loop do + select + when message = received.receive + break if message.body.is_a?(Protocol::Message::DebugMessage) + when timeout 200.milliseconds + break + end + end - server_ws.on_message do |m| - messages.send Protocol::Text.from_json(m) - server_ws.send(Protocol::Text.new(0_u64, Protocol::Message::RegisterResponse.new(true)).to_json) + result, code = client.runtime_manager.execute( + module_id, + PlaceOS::Core::ModuleManager.execute_payload(:echo, ["hello"]), + user_id: nil + ) + result.should eq %("hello") + code.should eq 200 + + messages = [] of String + 6.times do + select + when message = received.receive + next unless body = message.body.as?(Protocol::Message::DebugMessage) + messages << body.message + break if body.message == %([1,"hello"]) + when timeout 2.seconds + break + end + end + messages.should contain %([1,"hello"]) + + server_ws.send(Protocol::Text.new(2_u64, Protocol::Message::Ignore.new(module_id)).to_json) + + select + when response = received.receive + response.body.should be_a(Protocol::Message::Success) + response.sequence_id.should eq 2_u64 + when timeout 2.seconds + raise "timed out waiting for ignore request" + end + + client.runtime_manager.execute( + module_id, + PlaceOS::Core::ModuleManager.execute_payload(:echo, ["hello"]), + user_id: nil + ) + + expect_raises(Exception) do + loop do + select + when message = received.receive + if body = message.body.as?(Protocol::Message::DebugMessage) + raise "unexpected debug message after ignore: #{body.message}" + end + when timeout 500.milliseconds + raise "timeout" + end + end + end + ensure + client.runtime_manager.kill(driver_key) rescue nil + client.disconnect + client_ws.close rescue nil + server_ws.close rescue nil + end end + end + + it "flushes queued redis updates, runtime events, and heartbeat state when the websocket connects" do + client = nil.as(Client?) + client_ws, server_ws = mock_sockets + dir = File.join(Dir.tempdir, "edge-client-store-#{UUID.random}") + begin + Dir.mkdir_p(dir) + store = RuntimeStore.new(dir) + client = Client.new(skip_handshake: true, ping: false, sync_injected_socket: true, runtime_store: store) - spawn { server_ws.run } + store.save_snapshot( + State::Snapshot.new( + edge_id: "edge-test", + version: "snapshot-42", + last_modified: Time.utc, + drivers: [] of State::DesiredDriver, + modules: [] of State::DesiredModule + ) + ) + queued_update = store.queue_update(Protocol::RedisAction::HSET, "status/mod-1", "power", "on") + queued_event = store.queue_event(State::RuntimeEvent.new(:sync_status, message: "offline", snapshot_version: "snapshot-42", backlog_depth: 1)) - Fiber.yield + received = Channel(Protocol::Text).new - select - when message = messages.receive - when timeout 2.seconds - raise "timed out" + server_ws.on_message do |message| + parsed = Protocol::Text.from_json(message) + received.send(parsed) + + case body = parsed.body + when Protocol::Message::ProxyRedis, Protocol::Message::RuntimeEvent, Protocol::Message::Heartbeat + server_ws.send(Protocol::Text.new(parsed.sequence_id, Protocol::Message::Success.new(true)).to_json) + end + end + + spawn do + client.connect(client_ws) + rescue IO::Error | Channel::ClosedError + nil + end + run_mock_socket(server_ws) + Fiber.yield + + seen_types = [] of Protocol::Message::Body::Type + deadline = Time.instant + 2.seconds + until seen_types.includes?(Protocol::Message::Body::Type::ProxyRedis) && + seen_types.includes?(Protocol::Message::Body::Type::RuntimeEvent) && + seen_types.includes?(Protocol::Message::Body::Type::Heartbeat) + raise "timed out waiting for queued sync traffic flush" if Time.instant >= deadline + + select + when message = received.receive + seen_types << message.body.type + when timeout 50.milliseconds + end + end + + store.pending_updates.any?(&.id.==(queued_update.id)).should be_false + store.pending_events.any?(&.id.==(queued_event.id)).should be_false + + deadline = Time.instant + 2.seconds + until client.__test_connect_sync_count > 0 + raise "timed out waiting for connect sync completion" if Time.instant >= deadline + sleep 10.milliseconds + end + ensure + client.try &.disconnect + client_ws.close rescue nil + server_ws.close rescue nil + FileUtils.rm_rf(dir) end + end - message.should_not be_nil - message.body.should be_a(Protocol::Message::Register) + it "boots runtime state from the persisted snapshot without websocket orchestration" do + PlaceOS::Core::ProcessManager.with_driver do |mod, _driver_path, driver_key, _driver| + client = nil.as(Client?) + dir = File.join(Dir.tempdir, "edge-client-store-#{UUID.random}") + begin + Dir.mkdir_p(dir) + store = RuntimeStore.new(dir) + client = Client.new(skip_handshake: true, ping: false, runtime_store: store) - body = message.body.as(Protocol::Message::Register) + snapshot = State::Snapshot.new( + edge_id: "edge-test", + version: "persisted-v1", + last_modified: Time.utc, + drivers: [State::DesiredDriver.new(driver_key)], + modules: [State::DesiredModule.new( + mod.id.as(String), + driver_key, + true, + PlaceOS::Core::ModuleManager.start_payload(mod) + )] + ) + store.save_snapshot(snapshot) - # Message should say what's on the edge currently - # including modules and driver binaries - body.modules.should be_empty - body.drivers.should eq(client.drivers) + client.__test_load_persisted_snapshot - select - when result = coordination.receive - result.should be_true - when timeout 2.seconds - raise "timed out" + client.driver_loaded?(driver_key).should be_true + client.module_loaded?(mod.id.as(String)).should be_true + store.last_snapshot_version.should eq "persisted-v1" + ensure + client.try &.runtime_manager.kill(driver_key) + FileUtils.rm_rf(dir) + end end end end diff --git a/spec/placeos-edge/helper.cr b/spec/placeos-edge/helper.cr index 7e592aef..3866d7ad 100644 --- a/spec/placeos-edge/helper.cr +++ b/spec/placeos-edge/helper.cr @@ -8,3 +8,11 @@ def mock_sockets io_l, io_r = IO::Stapled.pipe ({HTTP::WebSocket.new(io_l), HTTP::WebSocket.new(io_r)}) end + +def run_mock_socket(ws : HTTP::WebSocket) + spawn do + ws.run + rescue IO::Error | Channel::ClosedError + nil + end +end diff --git a/spec/placeos-edge/offline_recovery_spec.cr b/spec/placeos-edge/offline_recovery_spec.cr new file mode 100644 index 00000000..1fddc6e9 --- /dev/null +++ b/spec/placeos-edge/offline_recovery_spec.cr @@ -0,0 +1,259 @@ +require "./helper" +require "file_utils" + +module PlaceOS::Edge + describe "Offline Recovery Scenarios", tags: ["edge"] do + it "recovers from extended offline period with large backlog" do + dir = File.join(Dir.tempdir, "edge-offline-#{UUID.random}") + begin + Dir.mkdir_p(dir) + store = RuntimeStore.new(dir) + + # Simulate edge going offline with active modules + snapshot = State::Snapshot.new( + edge_id: "edge-offline-test", + version: "v1", + last_modified: Time.utc, + drivers: [ + State::DesiredDriver.new("driver-meeting"), + State::DesiredDriver.new("driver-camera"), + ], + modules: [ + State::DesiredModule.new("mod-room-1", "driver-meeting", true, %({"ip":"192.168.1.100"})), + State::DesiredModule.new("mod-room-2", "driver-meeting", true, %({"ip":"192.168.1.101"})), + State::DesiredModule.new("mod-cam-1", "driver-camera", true, %({"ip":"192.168.1.200"})), + ] + ) + + store.save_snapshot(snapshot) + store.save_runtime_module("mod-room-1", State::RuntimeModule.new("driver-meeting", loaded: true, running: true)) + store.save_runtime_module("mod-room-2", State::RuntimeModule.new("driver-meeting", loaded: true, running: true)) + store.save_runtime_module("mod-cam-1", State::RuntimeModule.new("driver-camera", loaded: true, running: true)) + + # Simulate 500 redis updates while offline (realistic 1 hour offline scenario) + updates = [] of State::PendingRedisUpdate + 500.times do |i| + mod_id = "mod-room-#{(i % 2) + 1}" + key = ["power", "volume", "muted", "input_source"][i % 4] + value = ["true", "false", "50", "hdmi1"][i % 4] + updates << store.queue_update(Protocol::RedisAction::HSET, "status/#{mod_id}", key, value) + end + + # Simulate runtime events (different kinds to avoid full collapse) + events = [] of State::PendingRuntimeEvent + events << store.queue_event(State::RuntimeEvent.new(:driver_ready, driver_key: "driver-meeting")) + events << store.queue_event(State::RuntimeEvent.new(:driver_ready, driver_key: "driver-camera")) + events << store.queue_event(State::RuntimeEvent.new(:module_loaded, module_id: "mod-room-1", driver_key: "driver-meeting")) + events << store.queue_event(State::RuntimeEvent.new(:module_loaded, module_id: "mod-room-2", driver_key: "driver-meeting")) + events << store.queue_event(State::RuntimeEvent.new(:module_loaded, module_id: "mod-cam-1", driver_key: "driver-camera")) + events << store.queue_event(State::RuntimeEvent.new(:module_started, module_id: "mod-room-1", driver_key: "driver-meeting")) + events << store.queue_event(State::RuntimeEvent.new(:module_started, module_id: "mod-room-2", driver_key: "driver-meeting")) + events << store.queue_event(State::RuntimeEvent.new(:module_started, module_id: "mod-cam-1", driver_key: "driver-camera")) + events << store.queue_event(State::RuntimeEvent.new(:snapshot_applied, snapshot_version: "v1", backlog_depth: 500)) + # This will be collapsed with previous sync_status + events << store.queue_event(State::RuntimeEvent.new(:sync_status, message: "offline", snapshot_version: "v1", backlog_depth: 500)) + + store.flush + + # Verify state persisted + store.pending_updates.size.should be > 0 + store.pending_events.size.should be > 0 + + # Simulate edge restart (reload from disk) + reloaded = RuntimeStore.new(dir) + + # Verify all state recovered + reloaded.snapshot.not_nil!.version.should eq "v1" + reloaded.snapshot.not_nil!.modules.size.should eq 3 + reloaded.runtime_modules.size.should eq 3 + reloaded.runtime_modules["mod-room-1"].running.should be_true + reloaded.runtime_modules["mod-room-2"].running.should be_true + reloaded.runtime_modules["mod-cam-1"].running.should be_true + + # Verify pending items recovered (collapsed duplicates) + reloaded.pending_updates.size.should be < 500 # Collapsed + reloaded.pending_events.size.should be <= 10 # May have some collapsed + + # Simulate coming back online - acknowledge all updates + reloaded.pending_updates.each do |update| + reloaded.acknowledge_update(update.id) + end + + reloaded.pending_events.each do |event| + reloaded.acknowledge_event(event.id) + end + + # Verify queues cleared + reloaded.pending_updates.should be_empty + reloaded.pending_events.should be_empty + + # Verify compaction persisted + reloaded.flush + final = RuntimeStore.new(dir) + final.pending_updates.should be_empty + final.pending_events.should be_empty + ensure + FileUtils.rm_rf(dir) + end + end + + it "handles disk full scenario gracefully" do + dir = File.join(Dir.tempdir, "edge-diskfull-#{UUID.random}") + begin + Dir.mkdir_p(dir) + store = RuntimeStore.new(dir) + + snapshot = State::Snapshot.new( + edge_id: "edge-test", + version: "v1", + last_modified: Time.utc, + drivers: [State::DesiredDriver.new("driver-key")], + modules: [State::DesiredModule.new("mod-1", "driver-key", true, %({"ip":"192.168.1.1"}))] + ) + + store.save_snapshot(snapshot) + store.save_runtime_module("mod-1", State::RuntimeModule.new("driver-key", loaded: true, running: true)) + store.flush + + # Make directory read-only to simulate disk full + File.chmod(File.join(dir, "edge-state"), 0o555) + + # Should not crash when trying to write (graceful degradation) + # Edge continues in-memory only + update = store.queue_update(Protocol::RedisAction::HSET, "status/mod-1", "power", "on") + + # In-memory state should be updated + store.pending_updates.any?(&.id.==(update.id)).should be_true + + # But flush will fail silently (logged as warning) + store.flush # This will fail but shouldn't crash + + # Verify edge still operational in-memory + store.pending_updates.any?(&.id.==(update.id)).should be_true + + # Restore permissions + File.chmod(File.join(dir, "edge-state"), 0o755) + + # Should work again after permissions restored + update2 = store.queue_update(Protocol::RedisAction::HSET, "status/mod-1", "volume", "50") + store.flush + + reloaded = RuntimeStore.new(dir) + # First update was lost (couldn't write), second should be there + reloaded.pending_updates.any?(&.id.==(update2.id)).should be_true + ensure + File.chmod(File.join(dir, "edge-state"), 0o755) rescue nil + FileUtils.rm_rf(dir) + end + end + + it "handles rapid state changes with debouncing" do + dir = File.join(Dir.tempdir, "edge-debounce-#{UUID.random}") + begin + Dir.mkdir_p(dir) + store = RuntimeStore.new(dir) + + snapshot = State::Snapshot.new( + edge_id: "edge-test", + version: "v1", + last_modified: Time.utc, + drivers: [State::DesiredDriver.new("driver-key")], + modules: [State::DesiredModule.new("mod-1", "driver-key", true, %({"ip":"192.168.1.1"}))] + ) + + # Rapid updates (100 in quick succession) + 100.times do |i| + store.save_snapshot(snapshot) + store.save_runtime_module("mod-1", State::RuntimeModule.new("driver-key", loaded: true, running: (i % 2 == 0))) + end + + # Wait for debounce to settle + sleep 1.5.seconds + + # Should have written but not 100 times + reloaded = RuntimeStore.new(dir) + reloaded.snapshot.not_nil!.version.should eq "v1" + reloaded.runtime_modules["mod-1"].should_not be_nil + ensure + FileUtils.rm_rf(dir) + end + end + + it "handles backpressure limits correctly" do + dir = File.join(Dir.tempdir, "edge-backpressure-#{UUID.random}") + begin + Dir.mkdir_p(dir) + store = RuntimeStore.new(dir) + + # Queue slightly more than max updates to test limit + (Edge::MAX_PENDING_UPDATES + 100).times do |i| + store.queue_update(Protocol::RedisAction::HSET, "status/mod-#{i}", "key", "value") + end + + # Should be capped at max + store.pending_updates.size.should eq Edge::MAX_PENDING_UPDATES + + # Queue slightly more than max events to test limit + (Edge::MAX_PENDING_EVENTS + 50).times do |i| + store.queue_event(State::RuntimeEvent.new(:module_started, module_id: "mod-#{i}", driver_key: "driver")) + end + + # Should be capped at max + store.pending_events.size.should eq Edge::MAX_PENDING_EVENTS + + store.flush + + # Verify limits persisted + reloaded = RuntimeStore.new(dir) + reloaded.pending_updates.size.should eq Edge::MAX_PENDING_UPDATES + reloaded.pending_events.size.should eq Edge::MAX_PENDING_EVENTS + ensure + FileUtils.rm_rf(dir) + end + end + + it "handles corrupted state files gracefully" do + dir = File.join(Dir.tempdir, "edge-corrupt-#{UUID.random}") + begin + Dir.mkdir_p(dir) + Dir.mkdir_p(File.join(dir, "edge-state")) + + # Write corrupted core state + File.write(File.join(dir, "edge-state", "core.json"), "{ invalid json }") + + # Write corrupted pending updates (mix of valid and invalid) + File.write(File.join(dir, "edge-state", "pending-updates.jsonl"), <<-JSONL + {"id":"valid-1","action":"hset","hash_id":"status/mod-1","key_name":"power","status_value":"on"} + { invalid line } + {"id":"valid-2","action":"hset","hash_id":"status/mod-2","key_name":"volume","status_value":"50"} + JSONL + ) + + # Write corrupted pending events + File.write(File.join(dir, "edge-state", "pending-events.jsonl"), <<-JSONL + {"id":"event-1","event":{"timestamp":1234567890,"kind":"module_started","module_id":"mod-1","driver_key":"driver-key","message":null,"snapshot_version":null,"backlog_depth":null}} + not json at all + {"id":"event-2","event":{"timestamp":1234567891,"kind":"module_stopped","module_id":"mod-2","driver_key":"driver-key","message":null,"snapshot_version":null,"backlog_depth":null}} + JSONL + ) + + # Should load without crashing, skipping invalid lines + store = RuntimeStore.new(dir) + + # Core state should be empty (corrupted) + store.snapshot.should be_nil + + # Should have loaded valid lines only + store.pending_updates.size.should eq 2 + store.pending_updates.map(&.id).should contain("valid-1") + store.pending_updates.map(&.id).should contain("valid-2") + + store.pending_events.size.should eq 2 + store.pending_events.map(&.id).should contain("event-1") + store.pending_events.map(&.id).should contain("event-2") + ensure + FileUtils.rm_rf(dir) + end + end + end +end diff --git a/spec/placeos-edge/reconciler_spec.cr b/spec/placeos-edge/reconciler_spec.cr new file mode 100644 index 00000000..7e3506ab --- /dev/null +++ b/spec/placeos-edge/reconciler_spec.cr @@ -0,0 +1,129 @@ +require "./helper" +require "../processes/support" +require "file_utils" + +module PlaceOS::Edge + describe Reconciler, tags: ["edge"] do + it "loads and starts modules from the desired snapshot diff" do + PlaceOS::Core::ProcessManager.with_driver do |mod, _driver_path, driver_key, _driver| + dir = File.join(Dir.tempdir, "edge-reconciler-#{UUID.random}") + begin + Dir.mkdir_p(dir) + store = RuntimeStore.new(dir) + runtime = RuntimeManager.new + binary = BinaryManager.new("edge-123", PLACE_URI, CLIENT_SECRET) + reconciler = Reconciler.new(store, binary, runtime) + + snapshot = State::Snapshot.new( + edge_id: "edge-123", + version: "v1", + last_modified: Time.utc, + drivers: [State::DesiredDriver.new(driver_key)], + modules: [State::DesiredModule.new( + mod.id.as(String), + driver_key, + true, + PlaceOS::Core::ModuleManager.start_payload(mod) + )] + ) + + reconciler.apply(snapshot) + + runtime.driver_loaded?(driver_key).should be_true + runtime.module_loaded?(mod.id.as(String)).should be_true + runtime_store_module = store.runtime_modules[mod.id.as(String)] + runtime_store_module.loaded.should be_true + runtime_store_module.running.should be_true + ensure + FileUtils.rm_rf(dir) + end + end + end + + it "stops and unloads modules removed from the desired snapshot" do + PlaceOS::Core::ProcessManager.with_driver do |mod, _driver_path, driver_key, _driver| + dir = File.join(Dir.tempdir, "edge-reconciler-#{UUID.random}") + begin + Dir.mkdir_p(dir) + store = RuntimeStore.new(dir) + runtime = RuntimeManager.new + binary = BinaryManager.new("edge-123", PLACE_URI, CLIENT_SECRET) + reconciler = Reconciler.new(store, binary, runtime) + + initial = State::Snapshot.new( + edge_id: "edge-123", + version: "v1", + last_modified: Time.utc, + drivers: [State::DesiredDriver.new(driver_key)], + modules: [State::DesiredModule.new( + mod.id.as(String), + driver_key, + true, + PlaceOS::Core::ModuleManager.start_payload(mod) + )] + ) + reconciler.apply(initial) + + removed = State::Snapshot.new( + edge_id: "edge-123", + version: "v2", + last_modified: Time.utc + 1.second, + drivers: [] of State::DesiredDriver, + modules: [] of State::DesiredModule + ) + reconciler.apply(removed) + + runtime.module_loaded?(mod.id.as(String)).should be_false + store.runtime_modules.has_key?(mod.id.as(String)).should be_false + ensure + FileUtils.rm_rf(dir) + end + end + end + + it "restarts a running module when the desired payload changes" do + PlaceOS::Core::ProcessManager.with_driver do |mod, _driver_path, driver_key, _driver| + dir = File.join(Dir.tempdir, "edge-reconciler-#{UUID.random}") + begin + Dir.mkdir_p(dir) + store = RuntimeStore.new(dir) + runtime = RuntimeManager.new + binary = BinaryManager.new("edge-123", PLACE_URI, CLIENT_SECRET) + events = [] of State::RuntimeEvent + reconciler = Reconciler.new(store, binary, runtime, ->(event : State::RuntimeEvent) { events << event }) + + initial_payload = PlaceOS::Core::ModuleManager.start_payload(mod) + initial = State::Snapshot.new( + edge_id: "edge-123", + version: "v1", + last_modified: Time.utc, + drivers: [State::DesiredDriver.new(driver_key)], + modules: [State::DesiredModule.new(mod.id.as(String), driver_key, true, initial_payload)] + ) + reconciler.apply(initial) + + payload_hash = Hash(String, JSON::Any).from_json(initial_payload) + payload_hash["custom_name"] = JSON::Any.new("payload-updated") + updated_payload = payload_hash.to_json + updated = State::Snapshot.new( + edge_id: "edge-123", + version: "v2", + last_modified: Time.utc + 1.second, + drivers: [State::DesiredDriver.new(driver_key)], + modules: [State::DesiredModule.new(mod.id.as(String), driver_key, true, updated_payload)] + ) + reconciler.apply(updated) + + runtime_store_module = store.runtime_modules[mod.id.as(String)] + runtime_store_module.running.should be_true + runtime_store_module.payload.should eq updated_payload + + events.any? { |event| event.kind.module_stopped? && event.message == "payload changed" }.should be_true + (events.count(&.kind.module_started?) > 1).should be_true + ensure + FileUtils.rm_rf(dir) + end + end + end + end +end diff --git a/spec/placeos-edge/runtime_store_spec.cr b/spec/placeos-edge/runtime_store_spec.cr new file mode 100644 index 00000000..d5be96e0 --- /dev/null +++ b/spec/placeos-edge/runtime_store_spec.cr @@ -0,0 +1,127 @@ +require "./helper" +require "file_utils" + +module PlaceOS::Edge + describe RuntimeStore, tags: ["edge"] do + it "persists snapshots and queued redis updates" do + dir = File.join(Dir.tempdir, "edge-runtime-store-#{UUID.random}") + begin + Dir.mkdir_p(dir) + store = RuntimeStore.new(dir) + + snapshot = State::Snapshot.new( + edge_id: "edge-123", + version: "v1", + last_modified: Time.utc, + drivers: [State::DesiredDriver.new("driver-key")], + modules: [State::DesiredModule.new("mod-1", "driver-key", true, %({"name":"demo"}))] + ) + + store.save_snapshot(snapshot) + update = store.queue_update(Protocol::RedisAction::HSET, "status/mod-1", "power", "on") + pending_event = store.queue_event(State::RuntimeEvent.new(:sync_status, message: "offline", backlog_depth: 1)) + store.save_runtime_module("mod-1", State::RuntimeModule.new("driver-key", loaded: true, running: true, payload: "{}")) + + # Flush to ensure all writes complete before reloading + store.flush + + reloaded = RuntimeStore.new(dir) + reloaded.snapshot.not_nil!.version.should eq "v1" + reloaded.runtime_modules["mod-1"].running.should be_true + reloaded.pending_updates.first.id.should eq update.id + reloaded.pending_events.first.id.should eq pending_event.id + reloaded.last_error.should be_nil + + reloaded.acknowledge_update(update.id) + reloaded.acknowledge_event(pending_event.id) + reloaded.pending_updates.should be_empty + reloaded.pending_events.should be_empty + ensure + FileUtils.rm_rf(dir) + end + end + + it "collapses repeated latest-value redis updates and sync status events" do + dir = File.join(Dir.tempdir, "edge-runtime-store-#{UUID.random}") + begin + Dir.mkdir_p(dir) + store = RuntimeStore.new(dir) + + first = store.queue_update(Protocol::RedisAction::HSET, "status/mod-1", "power", "off") + second = store.queue_update(Protocol::RedisAction::HSET, "status/mod-1", "power", "on") + store.queue_update(Protocol::RedisAction::PUBLISH, "status/mod-1", "event", "hello") + + pending_updates = store.pending_updates + pending_updates.size.should eq 2 + pending_updates.any?(&.id.==(first.id)).should be_false + pending_updates.any?(&.id.==(second.id)).should be_true + pending_updates.find(&.action.publish?).not_nil!.status_value.should eq "hello" + + old_sync = store.queue_event(State::RuntimeEvent.new(:sync_status, message: "offline")) + new_sync = store.queue_event(State::RuntimeEvent.new(:sync_status, message: "online")) + store.queue_event(State::RuntimeEvent.new(:module_started, module_id: "mod-1", driver_key: "driver-key")) + + pending_events = store.pending_events + pending_events.size.should eq 2 + pending_events.any?(&.id.==(old_sync.id)).should be_false + pending_events.any?(&.id.==(new_sync.id)).should be_true + pending_events.find(&.event.kind.sync_status?).not_nil!.event.message.should eq "online" + ensure + FileUtils.rm_rf(dir) + end + end + + it "uses separate files for core state and pending items" do + dir = File.join(Dir.tempdir, "edge-runtime-store-#{UUID.random}") + begin + Dir.mkdir_p(dir) + store = RuntimeStore.new(dir) + + snapshot = State::Snapshot.new( + edge_id: "edge-123", + version: "v1", + last_modified: Time.utc, + drivers: [State::DesiredDriver.new("driver-key")], + modules: [State::DesiredModule.new("mod-1", "driver-key", true, %({"name":"demo"}))] + ) + + store.save_snapshot(snapshot) + store.queue_update(Protocol::RedisAction::HSET, "status/mod-1", "power", "on") + store.queue_update(Protocol::RedisAction::HSET, "status/mod-1", "volume", "50") + store.queue_event(State::RuntimeEvent.new(:module_started, module_id: "mod-1", driver_key: "driver-key")) + store.flush + + # Verify separate files exist + state_dir = File.join(dir, "edge-state") + Dir.exists?(state_dir).should be_true + File.exists?(File.join(state_dir, "core.json")).should be_true + File.exists?(File.join(state_dir, "pending-updates.jsonl")).should be_true + File.exists?(File.join(state_dir, "pending-events.jsonl")).should be_true + + # Verify core state file is small (doesn't contain pending items) + core_content = File.read(File.join(state_dir, "core.json")) + core_json = JSON.parse(core_content) + core_json["pending_updates"].as_a.should be_empty + core_json["pending_events"].as_a.should be_empty + core_json["snapshot"]["version"].as_s.should eq "v1" + + # Verify pending updates are in separate file (JSONL format) + updates_lines = File.read_lines(File.join(state_dir, "pending-updates.jsonl")) + updates_lines.size.should eq 2 + updates_lines.each do |line| + update = JSON.parse(line) + update["action"].as_s.should eq "hset" + update["hash_id"].as_s.should eq "status/mod-1" + end + + # Verify pending events are in separate file (JSONL format) + events_lines = File.read_lines(File.join(state_dir, "pending-events.jsonl")) + events_lines.size.should eq 1 + event_json = JSON.parse(events_lines.first) + event_json["event"]["kind"].as_s.should eq "module_started" + ensure + FileUtils.rm_rf(dir) + end + end + end +end diff --git a/spec/placeos-edge/server_spec.cr b/spec/placeos-edge/server_spec.cr index 4913a8dc..2f88b45e 100644 --- a/spec/placeos-edge/server_spec.cr +++ b/spec/placeos-edge/server_spec.cr @@ -2,5 +2,187 @@ require "./helper" module PlaceOS::Edge describe Server do + it "registers and exposes an edge manager while the socket is open" do + client_ws, server_ws = mock_sockets + edge = PlaceOS::Model::Generator.edge.save! + server = Server.new + + begin + server.manage_edge(edge.id.as(String), server_ws) + + manager = server.for?(edge.id.as(String)) + manager.should_not be_nil + manager.not_nil!.edge_id.should eq edge.id + + status = server.runtime_status[edge.id.as(String)]? + status.should_not be_nil + status.not_nil!.connected.should be_true + + edge.reload! + edge.online.should be_true + edge.last_seen.should_not be_nil + ensure + client_ws.close rescue nil + server_ws.close rescue nil + end + end + + it "removes the edge manager and marks the edge disconnected on socket close" do + client_ws, server_ws = mock_sockets + edge = PlaceOS::Model::Generator.edge.save! + server = Server.new + + begin + server.manage_edge(edge.id.as(String), server_ws) + server.for?(edge.id.as(String)).should_not be_nil + + run_mock_socket(server_ws) + run_mock_socket(client_ws) + + Fiber.yield + + client_ws.close + + deadline = Time.instant + 2.seconds + until server.for?(edge.id.as(String)).nil? + raise "timed out waiting for edge manager removal" if Time.instant >= deadline + sleep 10.milliseconds + end + + server.for?(edge.id.as(String)).should be_nil + + edge.reload! + edge.online.should be_false + + status = server.runtime_status[edge.id.as(String)]? + status.should be_nil + ensure + client_ws.close rescue nil + server_ws.close rescue nil + end + end + + it "updates runtime status from heartbeat messages" do + client_ws, server_ws = mock_sockets + edge = PlaceOS::Model::Generator.edge.save! + server = Server.new + + begin + server.manage_edge(edge.id.as(String), server_ws) + run_mock_socket(server_ws) + run_mock_socket(client_ws) + Fiber.yield + + heartbeat = Protocol::Text.new( + 11_u64, + Protocol::Message::Heartbeat.new( + Time.utc, + "snapshot-123", + 4, + 2 + ) + ) + client_ws.send(heartbeat.to_json) + + deadline = Time.instant + 2.seconds + loop do + status = server.runtime_status[edge.id.as(String)]? + break if status && status.snapshot_version == "snapshot-123" && status.pending_updates == 4 && status.pending_events == 2 && status.last_event == "heartbeat" + raise "timed out waiting for heartbeat status update" if Time.instant >= deadline + sleep 10.milliseconds + end + + status = server.runtime_status[edge.id.as(String)].not_nil! + status.connected.should be_true + status.snapshot_version.should eq "snapshot-123" + status.pending_updates.should eq 4 + status.pending_events.should eq 2 + status.last_event.should eq "heartbeat" + ensure + client_ws.close rescue nil + server_ws.close rescue nil + end + end + + it "updates runtime status from runtime events and records sync errors" do + client_ws, server_ws = mock_sockets + edge = PlaceOS::Model::Generator.edge.save! + server = Server.new + + begin + server.manage_edge(edge.id.as(String), server_ws) + run_mock_socket(server_ws) + run_mock_socket(client_ws) + Fiber.yield + + event = Protocol::Text.new( + 12_u64, + Protocol::Message::RuntimeEvent.new( + "sync_status", + nil, + nil, + "connection dropped", + "snapshot-err", + 7 + ) + ) + client_ws.send(event.to_json) + + deadline = Time.instant + 2.seconds + loop do + status = server.runtime_status[edge.id.as(String)]? + break if status && status.last_event == "sync_status" && status.last_error == "connection dropped" + raise "timed out waiting for runtime event status update" if Time.instant >= deadline + sleep 10.milliseconds + end + + status = server.runtime_status[edge.id.as(String)].not_nil! + status.connected.should be_true + status.snapshot_version.should eq "snapshot-err" + status.pending_updates.should eq 7 + status.last_event.should eq "sync_status" + status.last_error.should eq "connection dropped" + ensure + client_ws.close rescue nil + server_ws.close rescue nil + end + end + + it "keeps the replacement manager when an older connection closes later" do + client_ws_1, server_ws_1 = mock_sockets + client_ws_2, server_ws_2 = mock_sockets + edge = PlaceOS::Model::Generator.edge.save! + server = Server.new + + begin + server.manage_edge(edge.id.as(String), server_ws_1) + original = server.for?(edge.id.as(String)).not_nil! + + server.manage_edge(edge.id.as(String), server_ws_2) + replacement = server.for?(edge.id.as(String)).not_nil! + replacement.same?(original).should be_false + + run_mock_socket(server_ws_1) + run_mock_socket(client_ws_1) + run_mock_socket(server_ws_2) + run_mock_socket(client_ws_2) + Fiber.yield + + client_ws_1.close + sleep 100.milliseconds + + current = server.for?(edge.id.as(String)) + current.should_not be_nil + current.should eq replacement + + edge.reload! + edge.online.should be_true + ensure + client_ws_1.close rescue nil + server_ws_1.close rescue nil + client_ws_2.close rescue nil + server_ws_2.close rescue nil + end + end end end diff --git a/spec/placeos-edge/transport_spec.cr b/spec/placeos-edge/transport_spec.cr index 4ad897ae..e81bf916 100644 --- a/spec/placeos-edge/transport_spec.cr +++ b/spec/placeos-edge/transport_spec.cr @@ -1,6 +1,252 @@ require "./helper" module PlaceOS::Edge + private class TestTransport < Transport + def send_request_public(request : Protocol::Request) + send_request(request) + end + + def send_event_public(request : Protocol::Request) + send_event(request) + end + + def send_response_public(id : UInt64, response : Protocol::Client::Response | Protocol::Message::BinaryBody | Protocol::Message::Success) + send_response(id, response) + end + end + describe Transport do + it "routes requests to the on_request callback" do + client_ws, server_ws = mock_sockets + received = Channel(Tuple(UInt64, Protocol::Request)).new + + transport = TestTransport.new do |message| + received.send(message) + end + + server_ws.on_message do |message| + client_ws.send(message) + end + + begin + spawn do + transport.listen(client_ws) + client_ws.run + rescue IO::Error | Channel::ClosedError + nil + end + + spawn do + server_ws.run + rescue IO::Error | Channel::ClosedError + nil + end + + Fiber.yield + + server_ws.send(Protocol::Text.new(7_u64, Protocol::Message::Execute.new("mod-1", %({"ping":true}))).to_json) + + select + when message = received.receive + message[0].should eq 7_u64 + message[1].should be_a(Protocol::Message::Execute) + request = message[1].as(Protocol::Message::Execute) + request.module_id.should eq "mod-1" + request.payload.should eq %({"ping":true}) + when timeout 2.seconds + raise "timed out waiting for request callback" + end + ensure + transport.disconnect + client_ws.close rescue nil + server_ws.close rescue nil + end + end + + it "resolves responses for send_request" do + client_ws, server_ws = mock_sockets + + transport = TestTransport.new { |_| } + + server_ws.on_message do |message| + parsed = Protocol::Text.from_json(message) + request = parsed.body.as(Protocol::Message::Execute) + request.module_id.should eq "mod-2" + + response = Protocol::Text.new( + parsed.sequence_id, + Protocol::Message::ExecuteResponse.new(true, %("ok"), 200) + ) + server_ws.send(response.to_json) + end + + begin + spawn do + transport.listen(client_ws) + client_ws.run + rescue IO::Error | Channel::ClosedError + nil + end + + spawn do + server_ws.run + rescue IO::Error | Channel::ClosedError + nil + end + + Fiber.yield + + response = transport.send_request_public(Protocol::Message::Execute.new("mod-2", %({"value":1}))) + response.should be_a(Protocol::Message::ExecuteResponse) + + body = response.as(Protocol::Message::ExecuteResponse) + body.success.should be_true + body.output.should eq %("ok") + body.code.should eq 200 + ensure + transport.disconnect + client_ws.close rescue nil + server_ws.close rescue nil + end + end + + it "sends fire-and-forget events without waiting for a response" do + client_ws, server_ws = mock_sockets + received = Channel(Protocol::Text).new + + transport = TestTransport.new { |_| } + + server_ws.on_message do |message| + received.send(Protocol::Text.from_json(message)) + end + + begin + spawn do + transport.listen(client_ws) + client_ws.run + rescue IO::Error | Channel::ClosedError + nil + end + + spawn do + server_ws.run + rescue IO::Error | Channel::ClosedError + nil + end + + Fiber.yield + + transport.send_event_public(Protocol::Message::DebugMessage.new("mod-3", %([1,"hello"]))) + + select + when message = received.receive + message.body.should be_a(Protocol::Message::DebugMessage) + body = message.body.as(Protocol::Message::DebugMessage) + body.module_id.should eq "mod-3" + body.message.should eq %([1,"hello"]) + when timeout 2.seconds + raise "timed out waiting for event delivery" + end + ensure + transport.disconnect + client_ws.close rescue nil + server_ws.close rescue nil + end + end + + it "returns nil if a request is made after disconnect" do + transport = TestTransport.new { |_| } + + transport.disconnect + response = transport.send_request_public(Protocol::Message::Execute.new("mod-4", %({"value":2}))) + response.should be_nil + end + + it "releases a waiting request when disconnected before a response arrives" do + client_ws, server_ws = mock_sockets + transport = TestTransport.new { |_| } + result = Channel(Protocol::Response | Nil).new + + server_ws.on_message do |_message| + end + + begin + spawn do + transport.listen(client_ws) + client_ws.run + rescue IO::Error | Channel::ClosedError + nil + end + + run_mock_socket(server_ws) + Fiber.yield + + spawn do + result.send(transport.send_request_public(Protocol::Message::Execute.new("mod-5", %({"value":3})))) + rescue IO::Error | Channel::ClosedError + result.send(nil) + end + + sleep 50.milliseconds + transport.disconnect + + select + when response = result.receive + response.should be_nil + when timeout 2.seconds + raise "timed out waiting for request to be released on disconnect" + end + ensure + transport.disconnect + client_ws.close rescue nil + server_ws.close rescue nil + end + end + + it "routes binary responses to the waiting request" do + client_ws, server_ws = mock_sockets + transport = TestTransport.new { |_| } + binary_path = File.join(Dir.tempdir, "edge-transport-binary-#{Random.rand(10_000)}") + File.write(binary_path, "binary-payload") + + server_ws.on_message do |message| + parsed = Protocol::Text.from_json(message) + parsed.body.should be_a(Protocol::Message::FetchBinary) + + binary = Protocol::Binary.new + binary.sequence_id = parsed.sequence_id + binary.status = Protocol::Binary::Status::Success + binary.key = "driver-key" + binary.binary = IO::Memory.new("binary-payload") + server_ws.stream(binary: true) do |io| + io.write(binary.to_slice) + end + end + + begin + spawn do + transport.listen(client_ws) + client_ws.run + rescue IO::Error | Channel::ClosedError + nil + end + + run_mock_socket(server_ws) + Fiber.yield + + response = transport.send_request_public(Protocol::Message::FetchBinary.new("driver-key")) + response.should be_a(Protocol::Message::BinaryBody) + + body = response.as(Protocol::Message::BinaryBody) + body.success.should be_true + body.key.should eq "driver-key" + body.io.gets_to_end.should eq "binary-payload" + ensure + File.delete(binary_path) rescue nil + transport.disconnect + client_ws.close rescue nil + server_ws.close rescue nil + end + end end end diff --git a/spec/processes/edge_spec.cr b/spec/processes/edge_spec.cr index 93381342..976515d3 100644 --- a/spec/processes/edge_spec.cr +++ b/spec/processes/edge_spec.cr @@ -1,5 +1,6 @@ require "../helper" -require "./local_spec" +require "./support" +require "../placeos-edge/helper" module PlaceOS::Core::ProcessManager record Context, @@ -17,165 +18,104 @@ module PlaceOS::Core::ProcessManager ) edge_manager = Edge.new(edge_id: edge_id, socket: server_ws) - spawn { server_ws.run } + spawn do + server_ws.run + rescue IO::Error | Channel::ClosedError + nil + end Fiber.yield - spawn { client.connect(client_ws) } + spawn do + client.connect(client_ws) + rescue IO::Error | Channel::ClosedError + nil + end Fiber.yield - {client, edge_manager} + {client, edge_manager, client_ws, server_ws} end def self.with_edge(&) with_driver do |mod, driver_path, driver_key, _driver| - if existing_edge_id = mod.edge_id - mod.running = false - mod.save! - edge = Model::Edge.find!(existing_edge_id) - else - edge = Model::Generator.edge.save! - mod.edge_id = edge.id.as(String) - mod.running = false - mod.save! - end - - ctx = Context.new( - module: mod, - edge: edge, - driver_path: driver_path, - driver_key: driver_key, - ) - - client, process_manager = client_server(edge.id.as(String)) - - yield ({ctx, client, process_manager}) - end - end - - describe Edge, tags: ["edge", "processes"] do - it "debug" do - with_edge do |ctx, _client, pm| - module_id = ctx.module.id.as(String) - pm.load(module_id: module_id, driver_key: ctx.driver_path) - pm.start(module_id: module_id, payload: ModuleManager.start_payload(ctx.module)) - - message_channel = Channel(String).new - - pm.debug(module_id) do |message| - message_channel.send(message) - nil + edge = if existing_edge_id = mod.edge_id + Model::Edge.find!(existing_edge_id) + else + Model::Generator.edge.save! + end + + mod.edge_id = edge.id.as(String) + mod.running = true + mod.save! + + client, process_manager, client_ws, server_ws = client_server(edge.id.as(String)) + + begin + # Reconcile the desired state locally on the edge. Websocket is only used + # for realtime traffic after this point. + snapshot = ::PlaceOS::Edge::State::Snapshot.new( + edge_id: edge.id.as(String), + version: Time.utc.to_unix_ms.to_s, + last_modified: Time.utc, + drivers: [::PlaceOS::Edge::State::DesiredDriver.new(driver_key)], + modules: [::PlaceOS::Edge::State::DesiredModule.new( + module_id: mod.id.as(String), + driver_key: driver_key, + running: true, + payload: ModuleManager.start_payload(mod) + )] + ) + client.apply_snapshot(snapshot) + + module_id = mod.id.as(String) + deadline = Time.instant + 2.seconds + until client.driver_loaded?(driver_key) && client.module_loaded?(module_id) + raise "timed out waiting for edge snapshot reconciliation" if Time.instant >= deadline + sleep 20.milliseconds end - result, code = pm.execute(module_id: module_id, payload: ModuleManager.execute_payload(:echo, ["hello"]), user_id: nil) - result.should eq %("hello") - code.should eq 200 + ctx = Context.new( + module: mod, + edge: edge, + driver_path: driver_path, + driver_key: driver_key, + ) - select - when message = message_channel.receive - message.should eq %([1,"hello"]) - when timeout 2.seconds - raise "timeout" - end - end - end - - describe "driver_loaded?" do - it "confirms a driver is loaded" do - with_edge do |ctx, client, pm| - pm.load(module_id: "mod", driver_key: ctx.driver_key) - client.driver_loaded?(ctx.driver_key).should be_true - pm.driver_loaded?(ctx.driver_key).should be_true - end - end - - it "confirms a driver is not loaded" do - with_edge do |_ctx, client, pm| - pm.driver_loaded?("does-not-exist").should be_false - client.driver_loaded?("does-not-exist").should be_false - end - end - end - - describe "driver_status" do - it "returns driver status if present" do - with_edge do |ctx, client, pm| - pm.load(module_id: "mod", driver_key: ctx.driver_path) - - pm.driver_status(ctx.driver_path).should_not be_nil - status = client.driver_status(ctx.driver_key) - status.should_not be_nil - status.not_nil!.running.should be_false - status.not_nil!.launch_count.should eq(0) - end - end - - it "returns nil in not present" do - with_edge do |_ctx, client, pm| - pm.driver_status("doesntexist").should be_nil - client.driver_status("doesntexist").should be_nil - end + yield ({ctx, client, process_manager}) + ensure + client.runtime_manager.kill(driver_key) rescue nil + client.disconnect + process_manager.transport.disconnect rescue nil + client_ws.close rescue nil + server_ws.close rescue nil end end + end - it "execute" do - with_edge do |ctx, _client, pm| + describe Edge, tags: ["edge", "processes"] do + it "executes requests and reports runtime status from the edge runtime" do + with_edge do |ctx, client, pm| module_id = ctx.module.id.as(String) - pm.load(module_id: module_id, driver_key: ctx.driver_path) - pm.start(module_id: module_id, payload: ModuleManager.start_payload(ctx.module)) result, code = pm.execute(module_id: module_id, payload: ModuleManager.execute_payload(:used_for_place_testing), user_id: nil) result.should eq %("you can delete this file") code.should eq 200 - end - end - - it "ignore" do - with_edge do |ctx, _client, pm| - module_id = ctx.module.id.as(String) - pm.load(module_id: module_id, driver_key: ctx.driver_path) - pm.start(module_id: module_id, payload: ModuleManager.start_payload(ctx.module)) - message_channel = Channel(String).new - - callback = ->(message : String) do - message_channel.send message - nil - end - - pm.debug(module_id, &callback) - result, code = pm.execute(module_id: module_id, payload: ModuleManager.execute_payload(:echo, ["hello"]), user_id: nil) - result.should eq %("hello") - code.should eq 200 - - select - when message = message_channel.receive - message.should eq %([1,"hello"]) - when timeout 2.seconds - raise "timeout" - end - pm.ignore(module_id, &callback) - result, code = pm.execute(module_id: module_id, payload: ModuleManager.execute_payload(:echo, ["hello"]), user_id: nil) - result.should eq %("hello") - code.should eq 200 + pm.runtime_status.connected.should be_true + pm.runtime_status.last_seen.should_not be_nil + pm.edge_id.should eq(ctx.edge.id) - expect_raises(Exception) do - select - when message = message_channel.receive - when timeout 0.5.seconds - raise "timeout" - end - end + client.driver_loaded?(ctx.driver_key).should be_true + client.module_loaded?(module_id).should be_true + client.driver_status(ctx.driver_key).should_not be_nil + client.loaded_modules.should eq({ctx.driver_key => [module_id]}) end end - it "kill" do + it "kills edge-hosted drivers from core" do with_edge do |ctx, client, pm| - test_starting(pm, ctx.module, ctx.driver_key) - pid = client.protocol_manager_by_driver?(ctx.driver_key).try(&.pid).not_nil! - Process.exists?(pid).should be_true - pm.kill(ctx.driver_path).should be_true - success = Channel(Nil).new + pm.kill(ctx.driver_key).should be_true + success = Channel(Nil).new spawn do while Process.exists?(pid) sleep 100.milliseconds @@ -192,118 +132,52 @@ module PlaceOS::Core::ProcessManager end end - it "load" do + it "round-trips lifecycle commands over the realtime channel" do with_edge do |ctx, client, pm| - pm.driver_loaded?(ctx.driver_path).should be_false - pm.module_loaded?("mod").should be_false - client.module_loaded?("mod").should be_false - - pm.load(module_id: "mod", driver_key: ctx.driver_path) - - pm.driver_loaded?(ctx.driver_path).should be_true - client.driver_loaded?(ctx.driver_key).should be_true - - pm.module_loaded?("mod").should be_true - client.module_loaded?("mod").should be_true - end - end + module_id = ctx.module.id.as(String) - it "loaded_modules" do - with_edge do |ctx, _client, pm| - test_starting(pm, ctx.module, ctx.driver_key) - end - end + pm.unload(module_id).should be_true - describe "module_loaded?" do - it "confirms a module is loaded" do - with_edge do |ctx, _client, pm| - pm.load(module_id: "mod", driver_key: ctx.driver_path) - pm.module_loaded?("mod").should be_true + deadline = Time.instant + 2.seconds + until !client.module_loaded?(module_id) + raise "timed out waiting for edge unload" if Time.instant >= deadline + sleep 20.milliseconds end - end - it "confirms a module is not loaded" do - with_edge do |_ctx, _client, pm| - pm.module_loaded?("does-not-exist").should be_false + pm.load(module_id, ctx.driver_key).should be_true + pm.start(module_id, ModuleManager.start_payload(ctx.module)).should be_true + + deadline = Time.instant + 2.seconds + until client.module_loaded?(module_id) + raise "timed out waiting for edge reload" if Time.instant >= deadline + sleep 20.milliseconds end - end - end - it "run_count" do - with_edge do |ctx, _client, pm| - pm.load(module_id: "mod", driver_key: ctx.driver_path) - pm.run_count.should eq(ProcessManager::Count.new(1, 1)) + pm.stop(module_id).should be_true + client.module_loaded?(module_id).should be_true end end - pending "save_setting" do - end - - pending "on_redis" do - end - - it "start" do + it "fails execute cleanly when the edge disconnects" do with_edge do |ctx, client, pm| - module_id = ctx.module.id.as(String) - pm.load(module_id: module_id, driver_key: ctx.driver_path) - pm.start(module_id: module_id, payload: ModuleManager.start_payload(ctx.module)) - pm.loaded_modules.should eq({ctx.driver_key => [module_id]}) - client.loaded_modules.should eq({ctx.driver_key => [module_id]}) - pm.kill(ctx.driver_path) - end - end + client.disconnect - it "stop" do - with_edge do |ctx, _client, pm| - pm.kill(ctx.driver_path) - test_starting(pm, ctx.module, ctx.driver_key) - pm.stop(ctx.module.id.as(String)) - - sleep 100.milliseconds - pm.loaded_modules.should eq({ctx.driver_key => [] of String}) - end - end - - it "system_status" do - with_edge do |_ctx, _client, pm| - pm.system_status.should be_a(SystemStatus) - end - end - - describe "unload" do - it "removes driver if no dependent modules running" do - with_edge do |ctx, _client, pm| - pm.system_status.should be_a(SystemStatus) - path = ctx.driver_path + UUID.random.to_s - module_id = "mod" - File.copy(ctx.driver_path, path) - - pm.load(module_id: module_id, driver_key: path) - pm.driver_loaded?(path).should be_true - pm.module_loaded?(module_id).should be_true - pm.unload(module_id) - pm.driver_loaded?(path).should be_false - pm.module_loaded?(module_id).should be_false + deadline = Time.instant + 2.seconds + until !pm.runtime_status.connected + raise "timed out waiting for edge disconnect" if Time.instant >= deadline + sleep 20.milliseconds end - end - it "keeps driver if dependent modules still running" do - with_edge do |ctx, _client, pm| - path = ctx.driver_path + UUID.random.to_s - module0 = "mod0" - module1 = "mod1" - File.copy(ctx.driver_path, path) - - pm.load(module_id: module0, driver_key: path) - pm.load(module_id: module1, driver_key: path) - pm.driver_loaded?(path).should be_true - pm.module_loaded?(module0).should be_true - pm.module_loaded?(module1).should be_true - pm.unload(module0) - pm.module_loaded?(module0).should be_false - pm.module_loaded?(module1).should be_true - pm.driver_loaded?(path).should be_true + error = expect_raises(PlaceOS::Driver::RemoteException) do + pm.execute( + module_id: ctx.module.id.as(String), + payload: ModuleManager.execute_payload(:used_for_place_testing), + user_id: nil + ) end + + error.message.to_s.should contain("is not connected") + error.code.should eq 503 end end end diff --git a/spec/processes/local_spec.cr b/spec/processes/local_spec.cr index 00fc19e4..cc91c69e 100644 --- a/spec/processes/local_spec.cr +++ b/spec/processes/local_spec.cr @@ -1,42 +1,41 @@ -require "../helper" +require "./support" module PlaceOS::Core::ProcessManager - class_getter(store : DriverStore) { DriverStore.new } + describe Local, tags: "processes" do + managed = [] of Local - def self.with_driver(&) - _, driver, mod = setup(role: PlaceOS::Model::Driver::Role::Service) - result = DriverResource.load(driver, store, true) + build_pm = -> { + Local.new(discovery_mock).tap do |pm| + managed << pm + end + } - driver_key = ProcessManager.path_to_key(result.path) - puts "\n\nPROCESSING DRIVER key: #{driver_key}, path: #{result.path}, driver: #{driver.inspect}\n\n" - yield mod, result.path, driver_key, driver - end + Spec.before_each do + managed.clear + end - def self.test_starting(manager, mod, driver_key) - module_id = mod.id.as(String) - manager.load(module_id: module_id, driver_key: driver_key) - manager.start(module_id: module_id, payload: ModuleManager.start_payload(mod)) - manager.loaded_modules.should eq({driver_key => [module_id]}) - end + Spec.after_each do + managed.each(&.shutdown) + managed.clear + end - describe Local, tags: "processes" do with_driver do |mod, driver_path, driver_key, _driver| describe Local::Common do describe "driver_loaded?" do it "confirms a driver is loaded" do - pm = Local.new(discovery_mock) + pm = build_pm.call pm.load(module_id: "mod", driver_key: driver_key) pm.driver_loaded?(driver_key).should be_true end it "confirms a driver is not loaded" do - Local.new(discovery_mock).driver_loaded?("does-not-exist").should be_false + build_pm.call.driver_loaded?("does-not-exist").should be_false end end describe "driver_status" do it "returns driver status if present" do - pm = Local.new(discovery_mock) + pm = build_pm.call pm.load(module_id: "mod", driver_key: driver_key) status = pm.driver_status(driver_key) @@ -44,12 +43,12 @@ module PlaceOS::Core::ProcessManager end it "returns nil in not present" do - Local.new(discovery_mock).driver_status("doesntexist").should be_nil + build_pm.call.driver_status("doesntexist").should be_nil end end it "execute" do - pm = Local.new(discovery_mock) + pm = build_pm.call module_id = mod.id.as(String) pm.load(module_id: module_id, driver_key: driver_key) pm.start(module_id: module_id, payload: ModuleManager.start_payload(mod)) @@ -59,7 +58,7 @@ module PlaceOS::Core::ProcessManager end it "debug" do - pm = Local.new(discovery_mock) + pm = build_pm.call module_id = mod.id.as(String) pm.load(module_id: module_id, driver_key: driver_key) pm.start(module_id: module_id, payload: ModuleManager.start_payload(mod)) @@ -74,10 +73,11 @@ module PlaceOS::Core::ProcessManager code.should eq 200 messages = [] of String - 2.times do + 6.times do select when message = message_channel.receive messages << message + break if message == %([1,"hello"]) when timeout 2.seconds break end @@ -87,7 +87,7 @@ module PlaceOS::Core::ProcessManager end it "ignore" do - pm = Local.new(discovery_mock) + pm = build_pm.call module_id = mod.id.as(String) pm.load(module_id: module_id, driver_key: driver_key) pm.start(module_id: module_id, payload: ModuleManager.start_payload(mod)) @@ -99,6 +99,16 @@ module PlaceOS::Core::ProcessManager pm.debug(module_id, &callback) + # Drain startup/status debug noise so the next assertion only checks + # messages associated with the execute request below. + loop do + select + when message_channel.receive + when timeout 200.milliseconds + break + end + end + result, code = pm.execute(module_id: module_id, payload: ModuleManager.execute_payload(:echo, ["hello"]), user_id: nil) result.should eq %("hello") code.should eq 200 @@ -131,7 +141,7 @@ module PlaceOS::Core::ProcessManager end it "start" do - pm = Local.new(discovery_mock) + pm = build_pm.call module_id = mod.id.as(String) pm.load(module_id: module_id, driver_key: driver_key) pm.start(module_id: module_id, payload: ModuleManager.start_payload(mod)) @@ -140,7 +150,7 @@ module PlaceOS::Core::ProcessManager end it "stop" do - pm = Local.new(discovery_mock) + pm = build_pm.call pm.kill(driver_key) test_starting(pm, mod, driver_key) pm.stop(mod.id.as(String)) @@ -150,11 +160,11 @@ module PlaceOS::Core::ProcessManager end it "system_status" do - Local.new(discovery_mock).system_status.should be_a(SystemStatus) + build_pm.call.system_status.should be_a(SystemStatus) end it "kill" do - pm = Local.new(discovery_mock) + pm = build_pm.call test_starting(pm, mod, driver_key) pid = pm.protocol_manager_by_driver?(driver_key).try(&.pid).not_nil! @@ -179,25 +189,25 @@ module PlaceOS::Core::ProcessManager end it "loaded_modules" do - pm = Local.new(discovery_mock) + pm = build_pm.call test_starting(pm, mod, driver_key) pm.kill(driver_key) end describe "module_loaded?" do it "confirms a module is loaded" do - pm = Local.new(discovery_mock) + pm = build_pm.call pm.load(module_id: "mod", driver_key: driver_key) pm.module_loaded?("mod").should be_true end it "confirms a module is not loaded" do - Local.new(discovery_mock).module_loaded?("does-not-exist").should be_false + build_pm.call.module_loaded?("does-not-exist").should be_false end end it "run_count" do - pm = Local.new(discovery_mock) + pm = build_pm.call pm.load(module_id: "mod", driver_key: driver_key) pm.run_count.should eq(ProcessManager::Count.new(1, 1)) end @@ -208,7 +218,7 @@ module PlaceOS::Core::ProcessManager module_id = "mod" File.copy(driver_path, path) - pm = Local.new(discovery_mock) + pm = build_pm.call pm.load(module_id: module_id, driver_key: path) pm.driver_loaded?(path).should be_true pm.module_loaded?(module_id).should be_true @@ -226,7 +236,7 @@ module PlaceOS::Core::ProcessManager module1 = "mod1" File.copy(driver_path, path) - pm = Local.new(discovery_mock) + pm = build_pm.call pm.load(module_id: module0, driver_key: path) pm.load(module_id: module1, driver_key: path) pm.driver_loaded?(path).should be_true @@ -244,7 +254,7 @@ module PlaceOS::Core::ProcessManager end it "load" do - pm = Local.new(discovery_mock) + pm = build_pm.call pm.driver_loaded?(driver_key).should be_false pm.module_loaded?("mod").should be_false pm.load(module_id: "mod", driver_key: driver_key) @@ -252,10 +262,41 @@ module PlaceOS::Core::ProcessManager pm.module_loaded?("mod").should be_true end - pending "on_exec" do + it "on_exec" do + module_manager = module_manager_mock + pm = module_manager.local_processes + _, _, current_mod = setup(role: PlaceOS::Model::Driver::Role::Service) + module_id = current_mod.id.as(String) + pm.load(module_id: module_id, driver_key: driver_key) + pm.start(module_id: module_id, payload: ModuleManager.start_payload(current_mod)) + + request = Request.new(module_id, :exec, ModuleManager.execute_payload(:used_for_place_testing)) + response_channel = Channel(Request).new(1) + + pm.on_exec(request, ->(response : Request) { + response_channel.send(response) + }) + + response = response_channel.receive + response.cmd.should eq Request::Command::Result + response.error.should be_nil + response.code.should eq 200 + response.payload.should eq %("you can delete this file") + ensure + module_manager.try &.stop end - pending "save_setting" do + it "save_setting" do + pm = build_pm.call + _, _, current_mod = setup(role: PlaceOS::Model::Driver::Role::Service) + module_id = current_mod.id.as(String) + + pm.on_setting(module_id, "spec_key", YAML::Any.new("spec_value")) + + refreshed = Model::Module.find!(module_id) + setting = refreshed.settings_at?(:none) + setting.should_not be_nil + setting.not_nil!.any[YAML::Any.new("spec_key")].raw.should eq "spec_value" end end end diff --git a/spec/processes/support.cr b/spec/processes/support.cr new file mode 100644 index 00000000..d43e3ba9 --- /dev/null +++ b/spec/processes/support.cr @@ -0,0 +1,20 @@ +require "../helper" + +module PlaceOS::Core::ProcessManager + class_getter(store : DriverStore) { DriverStore.new } + + def self.with_driver(&) + _, driver, mod = setup(role: PlaceOS::Model::Driver::Role::Service) + result = DriverResource.load(driver, store, true) + + driver_key = ProcessManager.path_to_key(result.path) + yield mod, result.path, driver_key, driver + end + + def self.test_starting(manager, mod, driver_key) + module_id = mod.id.as(String) + manager.load(module_id: module_id, driver_key: driver_key) + manager.start(module_id: module_id, payload: ModuleManager.start_payload(mod)) + manager.loaded_modules.should eq({driver_key => [module_id]}) + end +end diff --git a/src/api/application.cr b/src/api/application.cr index 645cbbd2..e5d359af 100644 --- a/src/api/application.cr +++ b/src/api/application.cr @@ -68,7 +68,7 @@ module PlaceOS::Core::Api @[AC::Route::Exception(AC::Route::NotAcceptable, status_code: HTTP::Status::NOT_ACCEPTABLE)] @[AC::Route::Exception(AC::Route::UnsupportedMediaType, status_code: HTTP::Status::UNSUPPORTED_MEDIA_TYPE)] def bad_media_type(error) : ContentError - ContentError.new error: error.message.not_nil!, accepts: error.accepts + ContentError.new error: error.message.to_s, accepts: error.accepts end # Provides details on which parameter is missing or invalid @@ -88,7 +88,7 @@ module PlaceOS::Core::Api @[AC::Route::Exception(AC::Route::Param::MissingError, status_code: HTTP::Status::UNPROCESSABLE_ENTITY)] @[AC::Route::Exception(AC::Route::Param::ValueError, status_code: HTTP::Status::BAD_REQUEST)] def invalid_param(error) : ParameterError - ParameterError.new error: error.message.not_nil!, parameter: error.parameter, restriction: error.restriction + ParameterError.new error: error.message.to_s, parameter: error.parameter, restriction: error.restriction end end end diff --git a/src/api/chaos.cr b/src/api/chaos.cr index 59c31bf5..3f2c150d 100644 --- a/src/api/chaos.cr +++ b/src/api/chaos.cr @@ -6,7 +6,7 @@ module PlaceOS::Core::Api class Chaos < Application base "/api/core/v1/chaos/" - getter module_manager : ModuleManager { ModuleManager.instance } + getter module_manager : ModuleManager { Services.module_manager } # Terminate a process by executable path @[AC::Route::POST("/terminate")] @@ -17,7 +17,7 @@ module PlaceOS::Core::Api edge_id : String? = nil, ) : Nil raise Error::NotFound.new("no process manager found for #{driver_key}") unless manager = module_manager.process_manager(driver_key, edge_id) - manager.kill(driver_key) + raise Error::NotFound.new("driver #{driver_key} is not running") unless manager.kill(driver_key) end end end diff --git a/src/api/command.cr b/src/api/command.cr index 3d2d0059..591b7828 100644 --- a/src/api/command.cr +++ b/src/api/command.cr @@ -5,7 +5,7 @@ module PlaceOS::Core::Api class Command < Application base "/api/core/v1/command/" - property module_manager : ModuleManager { ModuleManager.instance } + getter module_manager : ModuleManager { Services.module_manager } # Loads if not already loaded # If the module is already running, it will be updated to latest settings. diff --git a/src/api/drivers.cr b/src/api/drivers.cr index 7c4ce42d..c83ace50 100644 --- a/src/api/drivers.cr +++ b/src/api/drivers.cr @@ -18,8 +18,12 @@ module PlaceOS::Core::Api @[AC::Param::Info(description: "the driver database id", example: "driver-GFEaAlJB5")] tag : String, ) : Bool - driver = Model::Driver.find!(tag) - repository = driver.repository! + driver = Model::Driver.find?(tag) + raise Error::NotFound.new("driver #{tag} not found") unless driver + + repository = driver.repository || driver.repository_id.try { |id| Model::Repository.find?(id) } + raise Error::NotFound.new("repository for driver #{tag} not found") unless repository + store.compiled?(driver_file, commit, repository.branch, repository.uri) end @@ -33,8 +37,12 @@ module PlaceOS::Core::Api @[AC::Param::Info(description: "the driver database id", example: "driver-GFEaAlJB5")] tag : String, ) : String - driver = Model::Driver.find!(tag) - repository = driver.repository! + driver = Model::Driver.find?(tag) + raise Error::NotFound.new("driver #{tag} not found") unless driver + + repository = driver.repository || driver.repository_id.try { |id| Model::Repository.find?(id) } + raise Error::NotFound.new("repository for driver #{tag} not found") unless repository + result = store.compile(driver_file, repository.uri, commit, repository.branch, true, repository.username, repository.decrypt_password, false) if result.success render text: "OK" @@ -66,7 +74,8 @@ module PlaceOS::Core::Api branch : String = "master", ) : Nil Log.context.set(driver: driver_file, repository: repository, commit: commit, branch: branch) - repo = Model::Repository.find!(repository) + repo = Model::Repository.find?(repository) + raise Error::NotFound.new("repository #{repository} not found") unless repo defaults = store.defaults(driver_file, commit, branch, repo.uri) if defaults.success response.headers["Content-Type"] = "application/json" diff --git a/src/api/edge.cr b/src/api/edge.cr index 1424854b..d4ab602f 100644 --- a/src/api/edge.cr +++ b/src/api/edge.cr @@ -1,12 +1,16 @@ require "./application" require "../placeos-core/module_manager" +require "../placeos-edge/state" +require "placeos-models/edge" +require "placeos-models/module" +require "placeos-models/settings" module PlaceOS::Core::Api class Edge < Application base "/api/core/v1/edge/" - getter module_manager : ModuleManager { ModuleManager.instance } + getter module_manager : ModuleManager { Services.module_manager } # websocket handling edge connections @[AC::Route::WebSocket("/control")] @@ -17,5 +21,91 @@ module PlaceOS::Core::Api ) : Nil module_manager.manage_edge(edge_id, socket) end + + @[AC::Route::GET("/:edge_id/desired_state")] + def desired_state( + @[AC::Param::Info(description: "the edge id we want the desired runtime state for", example: "edge-1234")] + edge_id : String, + ) : PlaceOS::Edge::State::Snapshot | Nil + edge = Model::Edge.find?(edge_id) + raise Error::NotFound.new("edge #{edge_id} not found in database") unless edge + + modules = Model::Module.on_edge(edge_id).to_a + last_modified = edge_last_modified(edge, modules) + return unless stale?(last_modified: last_modified) + + drivers = [] of PlaceOS::Edge::State::DesiredDriver + desired_modules = modules.compact_map do |mod| + driver = mod.driver || mod.driver_id.try { |id| Model::Driver.find?(id) } + next unless driver + + repository = driver.repository || driver.repository_id.try { |id| Model::Repository.find?(id) } + next unless repository + + driver_path = DriverStore.new.built?(driver.file_name, driver.commit, repository.branch, repository.uri) + next unless driver_path + + driver_key = Path[driver_path].basename.to_s + drivers << PlaceOS::Edge::State::DesiredDriver.new(driver_key) + PlaceOS::Edge::State::DesiredModule.new( + module_id: mod.id.as(String), + driver_key: driver_key, + running: mod.running, + payload: ModuleManager.start_payload(mod) + ) + end + + PlaceOS::Edge::State::Snapshot.new( + edge_id: edge_id, + version: last_modified.to_unix_ms.to_s, + last_modified: last_modified, + drivers: drivers.uniq! { |driver| driver.key } || drivers, + modules: desired_modules + ) + end + + @[AC::Route::GET("/:edge_id/drivers/:driver_key")] + def driver_binary( + @[AC::Param::Info(description: "the edge id we want to stream a driver for", example: "edge-1234")] + edge_id : String, + @[AC::Param::Info(description: "the compiled driver key", example: "drivers_place_meet_abcdef0_amd64")] + driver_key : String, + ) : Nil + raise Error::NotFound.new("edge #{edge_id} not found in database") unless Model::Edge.find?(edge_id) + + path = DriverStore.new.path(driver_key) + raise Error::NotFound.new("driver #{driver_key} not found") unless File.exists?(path) + + response.headers["Content-Type"] = "application/octet-stream" + render binary: File.read(path) + end + + private def edge_last_modified(edge : Model::Edge, modules : Array(Model::Module)) : Time + timestamps = [edge.updated_at || edge.created_at || Time.utc] + + modules.each do |mod| + timestamps << (mod.updated_at || mod.created_at || Time.utc) + + if driver = mod.driver + timestamps << (driver.updated_at || driver.created_at || Time.utc) + driver.settings.each do |setting| + timestamps << (setting.updated_at || setting.created_at || Time.utc) + end + end + + mod.settings.each do |setting| + timestamps << (setting.updated_at || setting.created_at || Time.utc) + end + + if control_system = mod.control_system + timestamps << (control_system.updated_at || control_system.created_at || Time.utc) + control_system.settings.each do |setting| + timestamps << (setting.updated_at || setting.created_at || Time.utc) + end + end + end + + timestamps.max + end end end diff --git a/src/api/root.cr b/src/api/root.cr index ffc8313c..581dc8b9 100644 --- a/src/api/root.cr +++ b/src/api/root.cr @@ -7,8 +7,13 @@ module PlaceOS::Core::Api class Root < Application base "/api/core/v1/" - class_getter resource_manager : ResourceManager { ResourceManager.instance } - class_getter module_manager : ModuleManager { ModuleManager.instance } + def self.resource_manager : ResourceManager + Services.resource_manager + end + + def self.module_manager : ModuleManager + Services.module_manager + end # Health Check ############################################################################################### diff --git a/src/api/status.cr b/src/api/status.cr index f21a6768..7532e9af 100644 --- a/src/api/status.cr +++ b/src/api/status.cr @@ -8,8 +8,8 @@ module PlaceOS::Core::Api class Status < Application base "/api/core/v1/status/" - getter module_manager : ModuleManager { ModuleManager.instance } - getter resource_manager : ResourceManager { ResourceManager.instance } + getter module_manager : ModuleManager { Services.module_manager } + getter resource_manager : ResourceManager { Services.resource_manager } record(RunCount, local : PlaceOS::Core::ProcessManager::Count, edge : Hash(String, PlaceOS::Core::ProcessManager::Count)) { include JSON::Serializable } @@ -65,6 +65,16 @@ module PlaceOS::Core::Api record(LoadedModules, local : Hash(String, Array(String)), edge : Hash(String, Hash(String, Array(String)))) { include JSON::Serializable } + record(EdgeConnection, + online : Bool, + last_seen : Time?, + websocket_connected : Bool, + snapshot_version : String?, + pending_updates : Int32, + pending_events : Int32, + last_event : String?, + last_error : String?) { include JSON::Serializable } + # Returns the lists of modules drivers have loaded for this core, and managed edges @[AC::Route::GET("/loaded")] def loaded : LoadedModules @@ -73,5 +83,25 @@ module PlaceOS::Core::Api edge: module_manager.edge_processes.loaded_modules, ) end + + @[AC::Route::GET("/edges")] + def edges : Hash(String, EdgeConnection) + statuses = module_manager.edge_processes.runtime_status + + PlaceOS::Model::Edge.all.each_with_object({} of String => EdgeConnection) do |edge, acc| + edge_id = edge.id.as(String) + runtime = statuses[edge_id]? + acc[edge_id] = EdgeConnection.new( + online: edge.online, + last_seen: edge.last_seen, + websocket_connected: runtime.try(&.connected) || false, + snapshot_version: runtime.try(&.snapshot_version), + pending_updates: runtime.try(&.pending_updates) || 0, + pending_events: runtime.try(&.pending_events) || 0, + last_event: runtime.try(&.last_event), + last_error: runtime.try(&.last_error) + ) + end + end end end diff --git a/src/edge-app.cr b/src/edge-app.cr index 1349e5c2..92bc2e1d 100644 --- a/src/edge-app.cr +++ b/src/edge-app.cr @@ -10,6 +10,7 @@ require "./logging" module PlaceOS::Edge uri = PLACE_URI secret = CLIENT_SECRET + edge_id = EDGE_ID # Command line options OptionParser.parse(ARGV.dup) do |parser| @@ -18,6 +19,7 @@ module PlaceOS::Edge parser.on("-u", "--uri", "Set URI for PlaceOS instance") { |u| uri = URI.parse(u) } parser.on("-s", "--secret", "Set application secret") { |s| secret = s } + parser.on("-i EDGE_ID", "--edge-id=EDGE_ID", "Set edge identifier") { |id| edge_id = id } parser.on("-v", "--version", "Display the application version") do puts "#{APP_NAME} v#{VERSION}" @@ -31,7 +33,21 @@ module PlaceOS::Edge end Log.info { "starting #{APP_NAME} v#{VERSION}" } - Client.new(uri, secret).connect do + + client = Client.new(uri, secret, edge_id: edge_id) + + # Handle graceful shutdown + shutdown = -> do + Log.info { "shutting down gracefully..." } + client.disconnect + sleep 100.milliseconds # Give time for cleanup + exit(0) + end + + Signal::INT.trap { shutdown.call } + Signal::TERM.trap { shutdown.call } + + client.connect do Log.info { "started #{APP_NAME} connected to #{uri}" } end end diff --git a/src/placeos-core/driver_manager.cr b/src/placeos-core/driver_manager.cr index 03b03154..f86129ce 100644 --- a/src/placeos-core/driver_manager.cr +++ b/src/placeos-core/driver_manager.cr @@ -13,7 +13,7 @@ module PlaceOS::Core def initialize( @startup : Bool = true, @binary_dir : String = "#{Dir.current}/bin/drivers", - @module_manager : ModuleManager = ModuleManager.instance, + @module_manager : ModuleManager = Services.module_manager, ) @store = DriverStore.new buffer_size = System.cpu_count.to_i @@ -45,10 +45,14 @@ module PlaceOS::Core driver : Model::Driver, store : DriverStore, startup : Bool = false, - module_manager : ModuleManager = ModuleManager.instance, + module_manager : ModuleManager = Services.module_manager, ) : Core::Result driver_id = driver.id.as(String) - repository = driver.repository! + repository = driver.repository || driver.repository_id.try { |id| Model::Repository.find?(id) } + unless repository + Log.debug { {message: "deferring driver load until repository exists", driver_id: driver_id, repository_id: driver.repository_id} } + return Core::Result.new(success: true) + end force_recompile = driver.recompile_commit? commit = force_recompile.nil? ? driver.commit : force_recompile @@ -97,10 +101,13 @@ module PlaceOS::Core # (Re)load modules onto the newly compiled driver stale_path = module_manager.reload_modules(driver) + current_path = store.driver_binary_path(driver.file_name, commit) - # Remove the stale driver if there was one - remove_stale_driver(driver_id: driver_id, + # Remove the stale driver only if it differs from the newly compiled artifact. + remove_stale_driver( + driver_id: driver_id, path: stale_path, + current_path: current_path, ) # Bump the commit on the driver post-compilation and module loading @@ -113,14 +120,37 @@ module PlaceOS::Core # Remove the stale driver binary if there was one # - def self.remove_stale_driver(path : Path?, driver_id : String) + def self.remove_stale_driver(path : Path?, driver_id : String, current_path : Path? = nil) return unless path + if current_path && path == current_path + Log.debug { {message: "skipping stale driver removal for current binary", driver_id: driver_id, path: path.to_s} } + return + end + if binary_in_use?(path) + Log.info { {message: "skipping stale driver removal for running binary", driver_id: driver_id, path: path.to_s} } + return + end Log.info { {message: "removing stale driver binary", driver_id: driver_id, path: path.to_s} } File.delete(path) if File.exists?(path) rescue Log.error { {message: "failed to remove stale binary", driver_id: driver_id, path: path.to_s} } end + private def self.binary_in_use?(path : Path) : Bool + target = path.to_s + Dir.each_child("/proc") do |entry| + next unless entry =~ /^\d+$/ + + begin + exe_path = File.readlink("/proc/#{entry}/exe") + return true if exe_path == target + rescue + next + end + end + false + end + def self.update_driver_commit(driver : Model::Driver, commit : String, startup : Bool) if startup # There's a potential for multiple writers on startup, However this is an eventually consistent operation. diff --git a/src/placeos-core/driver_manager/build_api.cr b/src/placeos-core/driver_manager/build_api.cr index 159210c4..61a259b1 100644 --- a/src/placeos-core/driver_manager/build_api.cr +++ b/src/placeos-core/driver_manager/build_api.cr @@ -48,8 +48,8 @@ module PlaceOS::Core host = URI.parse(Core.build_host) file_name = URI.encode_www_form(file_name) headers = HTTP::Headers.new - headers["X-Git-Username"] = username.not_nil! unless username.nil? - headers["X-Git-Password"] = password.not_nil! unless password.nil? + headers["X-Git-Username"] = username if username + headers["X-Git-Password"] = password if password resp = ConnectProxy::HTTPClient.new(host) do |client| path = "#{BUILD_API_BASE}/#{Core::ARCH}/#{file_name}" diff --git a/src/placeos-core/driver_manager/driver_cleanup.cr b/src/placeos-core/driver_manager/driver_cleanup.cr index a2fb0ac3..cfbe32c0 100644 --- a/src/placeos-core/driver_manager/driver_cleanup.cr +++ b/src/placeos-core/driver_manager/driver_cleanup.cr @@ -9,28 +9,36 @@ module PlaceOS::Core::DriverCleanup DEFAULT_STALE_THRESHOLD_DAYS = 15 @@tasker_inst : Tasker::Repeat(Nil)? + @@tasker_lock = Mutex.new def self.start_cleanup - tracker = StaleProcessTracker.new(DriverStore::BINARY_PATH, REDIS_CLIENT) + @@tasker_lock.synchronize do + @@tasker_inst.try &.cancel - @@tasker_inst = Tasker.every(ENV["STALE_SCAN_INTERVAL"]?.try &.to_i.hours || DEFAULT_STALE_SCAN_INTERVAL) do - stale_list = tracker.update_and_find_stale(ENV["STALE_THRESHOLD_DAYS"]?.try &.to_i || DEFAULT_STALE_THRESHOLD_DAYS) - tracker.delete_stale_executables(stale_list) + tracker = StaleProcessTracker.new(DriverStore::BINARY_PATH, REDIS_CLIENT) + @@tasker_inst = Tasker.every(ENV["STALE_SCAN_INTERVAL"]?.try &.to_i.hours || DEFAULT_STALE_SCAN_INTERVAL) do + stale_list = tracker.update_and_find_stale(ENV["STALE_THRESHOLD_DAYS"]?.try &.to_i || DEFAULT_STALE_THRESHOLD_DAYS) + tracker.delete_stale_executables(stale_list) + end end end def self.stop_cleanup - @@tasker_inst.try &.cancel + @@tasker_lock.synchronize do + @@tasker_inst.try &.cancel + @@tasker_inst = nil + end end class StaleProcessTracker Log = Core::Log + @now : Time = Time.utc def initialize(@folder : String, @redis : Redis::Client) - @now = Time.utc end def update_and_find_stale(days_threshold : Int32 = 30) + @now = Time.utc Log.info { "Starting stale executable check for #{@folder}" } current_executables = get_current_executables @@ -79,7 +87,7 @@ module PlaceOS::Core::DriverCleanup private def track_execution_events(current_executables) # Register new executables with discovery time current_executables.each do |exe| - unless @redis.hexists(exe, "discovered_at") + unless hash_data_for(exe).has_key?("discovered_at") @redis.hset(exe, "discovered_at", @now.to_unix) end end @@ -115,7 +123,7 @@ module PlaceOS::Core::DriverCleanup stale = [] of String current_executables.each do |exe| - redis_data = @redis.hgetall(exe) + redis_data = hash_data_for(exe) discovered_at = redis_data["discovered_at"]?.try(&.to_i64) last_executed_at = redis_data["last_executed_at"]?.try(&.to_i64) @@ -137,6 +145,21 @@ module PlaceOS::Core::DriverCleanup stale end + private def hash_data_for(key : String) : Hash(String, String) + case data = @redis.hgetall(key) + in Hash + data.transform_keys(&.to_s).transform_values(&.to_s) + in Array + hash = {} of String => String + data.each_slice(2) do |slice| + next unless field = slice[0]? + next unless value = slice[1]? + hash[field.to_s] = value.to_s + end + hash + end + end + private def process_owned_by_current_user?(pid_dir : String, current_uid : UInt32) : Bool status_file = File.join(pid_dir, "status") return false unless File.exists?(status_file) @@ -149,10 +172,9 @@ module PlaceOS::Core::DriverCleanup end private def get_process_executable_name(pid_dir : String) : String? - cmdline = File.read(File.join(pid_dir, "cmdline")).split("\0").first? - return unless cmdline - - File.basename(cmdline) + exe_path = File.readlink(File.join(pid_dir, "exe")) + return unless exe_path.starts_with?(@folder) + File.basename(exe_path) rescue nil end diff --git a/src/placeos-core/driver_manager/driver_integrity.cr b/src/placeos-core/driver_manager/driver_integrity.cr index ef7e07b6..b5d3f0e8 100644 --- a/src/placeos-core/driver_manager/driver_integrity.cr +++ b/src/placeos-core/driver_manager/driver_integrity.cr @@ -10,15 +10,22 @@ module PlaceOS::Core::DriverIntegrity include DB::Serializable end @@tasker_inst : Tasker::Repeat(Nil)? + @@tasker_lock = Mutex.new def self.start_integrity_checker - @@tasker_inst = Tasker.every(ENV["INTEGRITY_SCAN_INTERVAL"]?.try &.to_i.hours || DEFAULT_SCAN_INTERVAL) do - sync_drivers + @@tasker_lock.synchronize do + @@tasker_inst.try &.cancel + @@tasker_inst = Tasker.every(ENV["INTEGRITY_SCAN_INTERVAL"]?.try &.to_i.hours || DEFAULT_SCAN_INTERVAL) do + sync_drivers + end end end def self.stop_integrity_checker - @@tasker_inst.try &.cancel + @@tasker_lock.synchronize do + @@tasker_inst.try &.cancel + @@tasker_inst = nil + end end def self.remove_blank_files @@ -79,9 +86,12 @@ module PlaceOS::Core::DriverIntegrity drivers_delta = should_be_running - find_running_drivers return if drivers_delta.empty? drivers_to_start = drivers.select { |rec| (rec.driver_file + Core::ARCH).in?(drivers_delta) } - module_manager = ModuleManager.instance + module_manager = Services.current_module_manager? || ModuleManager.current_instance? + return unless module_manager drivers_to_start.each do |driver| - module_manager.reload_modules(Model::Driver.find!(driver.id)) + if model = Model::Driver.find?(driver.id) + module_manager.reload_modules(model) + end end end diff --git a/src/placeos-core/driver_manager/driver_store.cr b/src/placeos-core/driver_manager/driver_store.cr index c9fec062..7f904c6b 100644 --- a/src/placeos-core/driver_manager/driver_store.cr +++ b/src/placeos-core/driver_manager/driver_store.cr @@ -112,13 +112,16 @@ module PlaceOS::Core def reload_driver(driver_id : String) if driver = Model::Driver.find?(driver_id) - repo = driver.repository! + repo = driver.repository || driver.repository_id.try { |id| Model::Repository.find?(id) } + return {status: 404, message: "Repository for driver #{driver_id} not found"} unless repo if compiled?(driver.file_name, driver.commit, repo.branch, repo.uri) - manager = ModuleManager.instance + manager = Services.current_module_manager? || ModuleManager.current_instance? + return {status: 503, message: "Module manager unavailable"} unless manager stale_path = manager.reload_modules(driver) if path = stale_path - File.delete(path) rescue nil if File.exists?(path) + current_path = driver_binary_path(driver.file_name, driver.commit) + File.delete(path) rescue nil if File.exists?(path) && path != current_path end else return {status: 404, message: "Driver not compiled or not available on S3"} @@ -133,6 +136,7 @@ module PlaceOS::Core url = URI.parse(link.url) driver_file = Path[url.path].basename filename = Path[binary_path, driver_file] + tmp_filename = Path[binary_path, "#{driver_file}.#{Random::Secure.hex(8)}.tmp"] resp = if Core.production? || url.scheme == "https" ConnectProxy::HTTPClient.get(url.to_s) else @@ -149,7 +153,7 @@ module PlaceOS::Core body_io = IO::Digest.new(resp.body_io? || IO::Memory.new(resp.body), Digest::MD5.new) bytes_written = 0_i64 - File.open(filename, "wb+") do |f| + File.open(tmp_filename, "wb+") do |f| bytes_written = IO.copy(body_io, f) f.chmod(0o755) end @@ -157,14 +161,17 @@ module PlaceOS::Core # Verify actual downloaded size matches expected size unless link.size == bytes_written Log.error { {message: "Expected download size #{link.size}, but actually downloaded #{bytes_written} bytes", driver_file: driver_file} } - File.delete(filename) if File.exists?(filename) + File.delete(tmp_filename) if File.exists?(tmp_filename) raise Error.new("Downloaded size doesn't match expected size from build service") end + File.rename(tmp_filename, filename) filename.to_s else raise Error.new("Unable to fetch driver. Error : #{resp.body}") end + ensure + File.delete(tmp_filename) if tmp_filename && File.exists?(tmp_filename) end private record LinkData, size : Int64, md5 : String, modified : Time, url : String, link_expiry : Time do diff --git a/src/placeos-core/mappings/control_system_modules.cr b/src/placeos-core/mappings/control_system_modules.cr index bd0b8e01..587b1c0b 100644 --- a/src/placeos-core/mappings/control_system_modules.cr +++ b/src/placeos-core/mappings/control_system_modules.cr @@ -12,7 +12,7 @@ module PlaceOS::Core def initialize( @startup : Bool = true, - @module_manager : ModuleManager = ModuleManager.instance, + @module_manager : ModuleManager = Services.module_manager, ) super() end @@ -28,7 +28,7 @@ module PlaceOS::Core def self.update_mapping( system : Model::ControlSystem, startup : Bool = false, - module_manager : ModuleManager = ModuleManager.instance, + module_manager : ModuleManager = Services.module_manager, ) : Resource::Result relevant_node = startup || module_manager.discovery.own_node?(system.id.as(String)) unless relevant_node @@ -57,7 +57,7 @@ module PlaceOS::Core # def self.update_logic_modules( system : Model::ControlSystem, - module_manager : ModuleManager = ModuleManager.instance, + module_manager : ModuleManager = Services.module_manager, ) : Int32 return 0 if system.destroyed? @@ -111,9 +111,16 @@ module PlaceOS::Core # Construct a hash of resolved module name to ordered module ids grouped_modules = control_system.modules.group_by do |id| # Save a lookup if a module passed - (mod && id == mod.id ? mod : Model::Module.find!(id)).resolved_name + resolved_mod = mod && id == mod.id ? mod : Model::Module.find?(id) + unless resolved_mod + Log.debug { {message: "skipping missing module while rebuilding mappings", system_id: system_id, module_id: id} } + next "" + end + resolved_mod.resolved_name end + grouped_modules.delete("") + # Index the modules mappings = grouped_modules.each_with_object({} of String => String) do |(name, ids), mapping| # Indexes start from 1 diff --git a/src/placeos-core/mappings/module_names.cr b/src/placeos-core/mappings/module_names.cr index d6d8f5c1..66c6dbd4 100644 --- a/src/placeos-core/mappings/module_names.cr +++ b/src/placeos-core/mappings/module_names.cr @@ -10,7 +10,7 @@ module PlaceOS::Core class Mappings::ModuleNames < Resource(Model::Module) protected getter module_manager : ModuleManager - def initialize(@module_manager : ModuleManager = ModuleManager.instance) + def initialize(@module_manager : ModuleManager = Services.module_manager) super() end @@ -25,7 +25,7 @@ module PlaceOS::Core def self.update_module_mapping( mod : Model::Module, - module_manager : ModuleManager = ModuleManager.instance, + module_manager : ModuleManager = Services.module_manager, ) : Resource::Result module_id = mod.id.as(String) # Only consider name change events diff --git a/src/placeos-core/module_manager.cr b/src/placeos-core/module_manager.cr index 164f70df..4c26a51a 100644 --- a/src/placeos-core/module_manager.cr +++ b/src/placeos-core/module_manager.cr @@ -34,10 +34,18 @@ module PlaceOS::Core # These are modules with launch_on_execute that are "running" but driver not spawned getter lazy_modules : Hash(String, Bool) = {} of String => Bool private getter lazy_modules_lock : Mutex = Mutex.new + @@instance_lock = Mutex.new def stop + @local_processes.try &.shutdown + @local_processes = nil + edge_processes.stop + lazy_modules_lock.synchronize do + lazy_modules.clear + end clustering.unregister stop_process_check + @started = false end delegate path_for?, to: local_processes @@ -51,14 +59,35 @@ module PlaceOS::Core # Redis channel that cluster leader publishes stable cluster versions to REDIS_VERSION_CHANNEL = "cluster/cluster_version" + @@instance : ModuleManager? + # Singleton configured from environment - class_getter instance : ModuleManager { ModuleManager.new(uri: self.uri) } + def self.instance : ModuleManager + @@instance_lock.synchronize do + @@instance ||= ModuleManager.new(uri: self.uri) + end + end + + def self.current_instance? : ModuleManager? + @@instance + end + + def self.reset_instance + @@instance_lock.synchronize do + @@instance.try &.stop + @@instance = nil + end + end # Manager for remote edge module processes getter edge_processes : Edge::Server = Edge::Server.new # Manager for local module processes - getter local_processes : ProcessManager::Local { ProcessManager::Local.new(discovery) } + @local_processes : ProcessManager::Local? + + getter local_processes : ProcessManager::Local do + @local_processes ||= ProcessManager::Local.new(discovery, self) + end # Start up process is as follows.. # - registered @@ -147,6 +176,11 @@ module PlaceOS::Core def load_module(mod : Model::Module, rendezvous_hash : RendezvousHash = discovery.rendezvous) module_id = mod.id.as(String) + if on_managed_edge?(mod) + Log.info { {message: "edge module desired state updated", module_id: module_id, edge_id: mod.edge_id} } + return + end + allocated_uri = ModuleManager.core_uri(mod, rendezvous_hash) if allocated_uri == @clustering.uri @@ -156,10 +190,19 @@ module PlaceOS::Core return end - driver = mod.driver! + driver = mod.driver || mod.driver_id.try { |id| Model::Driver.find?(id) } + unless driver + Log.debug { {message: "deferring module load until driver exists", module_id: module_id, driver_id: mod.driver_id} } + return + end + driver_id = driver.id.as(String) # repository_folder = driver.repository.not_nil!.folder_name - repository = driver.repository! + repository = driver.repository || driver.repository_id.try { |id| Model::Repository.find?(id) } + unless repository + Log.debug { {message: "deferring module load until repository exists", module_id: module_id, driver_id: driver_id} } + return + end ::Log.with_context( driver_id: driver_id, @@ -172,7 +215,7 @@ module PlaceOS::Core driver_path = store.built?(driver.file_name, driver.commit, repository.branch, repository.uri) # Check if the driver is built if driver_path.nil? - Log.error { "driver does not exist for module" } + Log.debug { "driver does not exist for module" } return end @@ -185,7 +228,7 @@ module PlaceOS::Core Log.info { {message: "unloading module no longer on node", module_id: module_id} } unload_module(mod) else - Log.warn { {message: "load module request invalid. #{allocated_uri.inspect} != #{@clustering.uri.inspect}", module_id: module_id} } + Log.debug { {message: "load module request invalid. #{allocated_uri.inspect} != #{@clustering.uri.inspect}", module_id: module_id} } end end @@ -194,6 +237,11 @@ module PlaceOS::Core def unload_module(mod : Model::Module) module_id = mod.id.as(String) + if on_managed_edge?(mod) + Log.info { {message: "edge module marked for unload via desired state", module_id: module_id, edge_id: mod.edge_id} } + return + end + # Remove from lazy modules tracking if present unregister_lazy_module(module_id) @@ -206,6 +254,11 @@ module PlaceOS::Core def start_module(mod : Model::Module) module_id = mod.id.as(String) + if on_managed_edge?(mod) + Log.info { {message: "edge module marked running via desired state", module_id: module_id, edge_id: mod.edge_id} } + return + end + # For lazy modules, just ensure metadata is in Redis (driver not spawned yet) if mod.launch_on_execute register_lazy_module(mod) @@ -235,6 +288,11 @@ module PlaceOS::Core def stop_module(mod : Model::Module) module_id = mod.id.as(String) + if on_managed_edge?(mod) + Log.info { {message: "edge module marked stopped via desired state", module_id: module_id, edge_id: mod.edge_id} } + return + end + # For lazy modules, just remove from tracking and clear metadata if mod.launch_on_execute unregister_lazy_module(module_id) @@ -250,9 +308,26 @@ module PlaceOS::Core # Update/start modules with new configuration # def refresh_module(mod : Model::Module) - process_manager(mod) do |_manager| - mod.running.tap { |running| start_module(mod) if running } + return mod.running if on_managed_edge?(mod) + + module_id = mod.id.as(String) + + loaded = process_manager(mod) do |manager| + manager.module_loaded?(module_id) end + + unless loaded || (mod.launch_on_execute && mod.running) + load_module(mod) + loaded = process_manager(mod) do |manager| + manager.module_loaded?(module_id) + end + end + + if mod.running && (loaded || mod.launch_on_execute) + start_module(mod) + end + + mod.running end # Stops modules on stale driver and starts them on the new driver @@ -333,10 +408,11 @@ module PlaceOS::Core def process_manager(mod : Model::Module, & : ProcessManager ->) if mod.on_edge? && (edge_id = mod.edge_id) if (manager = edge_processes.for?(edge_id)).nil? - Log.error { "missing edge manager for #{edge_id}" } + Log.debug { "missing edge manager for #{edge_id}" } return end yield manager + return end yield local_processes @@ -433,7 +509,8 @@ module PlaceOS::Core case mod in String if Model::Module.has_edge_hint?(mod) - Model::Module.find!(mod).edge_id.as(String) + model = Model::Module.find?(mod) + model.try(&.edge_id).try(&.as(String)) || mod else mod end @@ -452,19 +529,78 @@ module PlaceOS::Core ########################################################################### def self.start_payload(mod : Model::Module) + live_mod = mod.id.try { |id| Model::Module.find?(id.as(String)) } || mod + payload_mod = prepare_payload_module(live_mod, fallback: mod) + begin - # Merge module settings - merged_settings = mod.merge_settings + merged_settings = merge_module_settings(payload_mod) rescue e merged_settings = "{}" - Log.error(exception: e) { {message: "Failed to merge module settings", module_id: mod.id, name: mod.name, custom_name: mod.custom_name} } + Log.error(exception: e) { {message: "Failed to merge module settings", module_id: payload_mod.id, name: payload_mod.name, custom_name: payload_mod.custom_name} } end # Start format - payload = mod.to_json.rchop + payload = payload_mod.to_json.rchop # The settings object needs to be unescaped - %(#{payload},"control_system":#{mod.control_system.to_json},"settings":#{merged_settings}}) + %(#{payload},"control_system":#{payload_mod.control_system.to_json},"settings":#{merged_settings}}) + end + + private def self.merge_module_settings(mod : Model::Module) : String + hierarchy = mod.settings + + if mod.role.logic? + control_system = mod.control_system || mod.control_system_id.try { |id| Model::ControlSystem.find?(id) } + if control_system + hierarchy.concat(control_system.settings_hierarchy) + else + raise Model::Error::NoParent.new("Missing control system: module_id=#{mod.id} control_system_id=#{mod.control_system_id}") + end + end + + driver_id = mod.driver_id || mod.driver.try(&.id) + raise Model::Error::NoParent.new("Missing driver: module_id=#{mod.id} driver_id=#{mod.driver_id}") if driver_id.nil? + + hierarchy.concat(Model::Settings.for_parent(driver_id)) + + hierarchy + .compact + .reverse! + .reduce({} of YAML::Any => YAML::Any) do |merged, setting| + begin + merged.merge!(setting.any) + rescue error + Log.warn(exception: error) { "failed to merge settings: #{setting.inspect}" } + end + merged + end + .to_json + end + + private def self.prepare_payload_module(primary : Model::Module, fallback : Model::Module) : Model::Module + hydrate_module_associations!(fallback, fallback: fallback) + hydrate_module_associations!(primary, fallback: fallback) + + return primary unless primary.driver.nil? + return fallback unless fallback.driver.nil? + + primary + end + + private def self.hydrate_module_associations!(target : Model::Module, fallback : Model::Module) + if target.driver.nil? + driver = fallback.driver || + fallback.driver_id.try { |id| Model::Driver.find?(id) } || + target.driver_id.try { |id| Model::Driver.find?(id) } + target.driver = driver unless driver.nil? + end + + if target.control_system.nil? + control_system = fallback.control_system || + fallback.control_system_id.try { |id| Model::ControlSystem.find?(id) } || + target.control_system_id.try { |id| Model::ControlSystem.find?(id) } + target.control_system = control_system unless control_system.nil? + end end def self.execute_payload(method : String | Symbol, args : Enumerable? = nil, named_args : Hash | NamedTuple | Nil = nil) @@ -511,8 +647,17 @@ module PlaceOS::Core # Populate module metadata in Redis from build service (without spawning driver) def populate_lazy_module_metadata(mod : Model::Module) module_id = mod.id.as(String) - driver = mod.driver! - repository = driver.repository! + driver = mod.driver || mod.driver_id.try { |id| Model::Driver.find?(id) } + unless driver + Log.debug { {message: "skipping lazy module metadata until driver exists", module_id: module_id, driver_id: mod.driver_id} } + return + end + + repository = driver.repository || driver.repository_id.try { |id| Model::Repository.find?(id) } + unless repository + Log.debug { {message: "skipping lazy module metadata until repository exists", module_id: module_id, driver_id: driver.id} } + return + end # Fetch metadata from build service result = store.metadata(driver.file_name, driver.commit, repository.branch, repository.uri) diff --git a/src/placeos-core/process_check.cr b/src/placeos-core/process_check.cr index a687d06c..85f82f15 100644 --- a/src/placeos-core/process_check.cr +++ b/src/placeos-core/process_check.cr @@ -9,6 +9,7 @@ module PlaceOS::Core # Begin scanning for dead driver processes protected def start_process_check + stop_process_check @process_check_task = Tasker.every(PROCESS_CHECK_PERIOD) do process_check end @@ -16,6 +17,7 @@ module PlaceOS::Core protected def stop_process_check @process_check_task.try &.cancel + @process_check_task = nil end enum State diff --git a/src/placeos-core/process_manager.cr b/src/placeos-core/process_manager.cr index e0b61448..a38dda34 100644 --- a/src/placeos-core/process_manager.cr +++ b/src/placeos-core/process_manager.cr @@ -24,10 +24,14 @@ module PlaceOS::Core def attach_debugger(module_id : String, socket : HTTP::WebSocket) Log.trace { {message: "binding debug session to module", module_id: module_id} } - channel = Channel(String).new(capacity: 1) + channel = Channel(String).new(capacity: 8) callback : String -> Nil = ->(message : String) do - channel.send(message) unless channel.closed? + begin + channel.send(message) unless channel.closed? + rescue Channel::ClosedError + nil + end end # Stop debugging when the socket closes @@ -36,15 +40,19 @@ module PlaceOS::Core ignore(module_id, &callback) end - # Attach the debug callback for the module - debug(module_id, &callback) - # Asyncronously send debug messages from the module spawn do - while message = channel.receive? - socket.send(message) + begin + while message = channel.receive? + socket.send(message) + end + rescue IO::Error | Channel::ClosedError + nil end end + + # Attach the debug callback for the module after the sender fiber exists + debug(module_id, &callback) end abstract def debug(module_id : String, &_on_message : DebugCallback) @@ -66,7 +74,8 @@ module PlaceOS::Core # Handler for settings updates # def on_setting(id : String, setting_name : String, setting_value : YAML::Any) - mod = PlaceOS::Model::Module.find!(id) + mod = PlaceOS::Model::Module.find?(id) + raise ModuleError.new("Could not locate module #{id}, no matching database record") unless mod if setting = mod.settings_at?(:none) else setting = PlaceOS::Model::Settings.new diff --git a/src/placeos-core/process_manager/common.cr b/src/placeos-core/process_manager/common.cr index 36a17f31..0236b089 100644 --- a/src/placeos-core/process_manager/common.cr +++ b/src/placeos-core/process_manager/common.cr @@ -48,15 +48,57 @@ module PlaceOS::Core::ProcessManager::Common end def kill(driver_key : String) : Bool - !!protocol_manager_by_driver?(driver_key).try do |manager| - pid = manager.pid - Process.signal(Signal::KILL, pid) - true + manager = protocol_manager_by_driver?(driver_key) + return false unless manager + + protocol_manager_lock.synchronize do + @driver_protocol_managers.delete(ProcessManager.path_to_key(driver_key)) + @module_protocol_managers.reject! { |_, current| current == manager } + end + + begin + manager.terminate + rescue + nil end + + pid = manager.pid + return true if pid <= 0 + + Process.signal(Signal::KILL, pid) + true rescue false end + def shutdown : Nil + managers = protocol_manager_lock.synchronize do + @driver_protocol_managers.values.dup + end + + managers.each do |manager| + begin + manager.terminate + rescue + nil + end + + pid = manager.pid + if pid > 0 + begin + Process.signal(Signal::KILL, pid) + rescue + nil + end + end + end + + protocol_manager_lock.synchronize do + @module_protocol_managers.clear + @driver_protocol_managers.clear + end + end + def debug(module_id : String, &on_message : DebugCallback) manager = protocol_manager_by_module?(module_id) raise ModuleError.new("No protocol manager for #{module_id}") if manager.nil? @@ -189,7 +231,14 @@ module PlaceOS::Core::ProcessManager::Common end def remove_driver_manager(key) - set_driver_protocol_manager(key, nil) + manager = set_driver_protocol_manager(key, nil) + return unless manager + + begin + manager.terminate + rescue + nil + end end private getter protocol_manager_lock = Mutex.new(protection: :reentrant) diff --git a/src/placeos-core/process_manager/edge.cr b/src/placeos-core/process_manager/edge.cr index fb30d683..bde3f3c9 100644 --- a/src/placeos-core/process_manager/edge.cr +++ b/src/placeos-core/process_manager/edge.cr @@ -12,13 +12,28 @@ module PlaceOS::Core alias Transport = PlaceOS::Edge::Transport alias Protocol = PlaceOS::Edge::Protocol + record RuntimeStatus, + connected : Bool, + last_seen : Time?, + snapshot_version : String?, + pending_updates : Int32, + pending_events : Int32, + last_event : String?, + last_error : String? do + include JSON::Serializable + end + getter transport : Transport getter edge_id : String + getter runtime_status : RuntimeStatus = RuntimeStatus.new(false, nil, nil, 0, 0, nil, nil) protected getter(store : DriverStore) { DriverStore.new } - def initialize(@edge_id : String, socket : HTTP::WebSocket) - @transport = Transport.new do |(sequence_id, request)| + def initialize(@edge_id : String, socket : HTTP::WebSocket, on_disconnect : Proc(Nil)? = nil) + disconnect_handler = on_disconnect || -> { disconnected! } + @transport = Transport.new(on_disconnect: ->(_error : IO::Error | HTTP::WebSocket::CloseCode) { + disconnect_handler.call + }) do |(sequence_id, request)| if request.is_a?(Protocol::Client::Request) handle_request(sequence_id, request) else @@ -26,8 +41,13 @@ module PlaceOS::Core end end - spawn { transport.listen(socket) } + spawn do + transport.listen(socket) + rescue IO::Error | Channel::ClosedError + nil + end Fiber.yield + update_runtime_status(connected: true, last_seen: Time.utc, last_event: "connected") end def handle_request(sequence_id : UInt64, request : Protocol::Client::Request) @@ -38,6 +58,17 @@ module PlaceOS::Core boolean_response(sequence_id, request) do forward_debug_message(request.module_id, request.message) end + when Protocol::Message::Heartbeat + boolean_response(sequence_id, request) do + update_runtime_status( + connected: true, + last_seen: request.timestamp, + snapshot_version: request.snapshot_version, + pending_updates: request.pending_updates, + pending_events: request.pending_events, + last_event: "heartbeat" + ) + end when Protocol::Message::FetchBinary response = fetch_binary(request.key) send_response(sequence_id, response) @@ -50,9 +81,6 @@ module PlaceOS::Core status_value: request.status_value, ) end - when Protocol::Message::Register - register_response = register(modules: request.modules, drivers: request.drivers) - send_response(sequence_id, register_response) when Protocol::Message::SettingsAction boolean_response(sequence_id, request) do on_setting( @@ -61,6 +89,17 @@ module PlaceOS::Core setting_value: YAML.parse(request.setting_value) ) end + when Protocol::Message::RuntimeEvent + boolean_response(sequence_id, request) do + update_runtime_status( + connected: true, + last_seen: Time.utc, + snapshot_version: request.snapshot_version, + pending_updates: request.backlog_depth, + last_event: request.kind, + last_error: request.kind == "sync_status" ? request.message : nil + ) + end end rescue e Log.error(exception: e) { { @@ -70,6 +109,8 @@ module PlaceOS::Core end def execute(module_id : String, payload : String, user_id : String?, mod : Model::Module? = nil) + raise PlaceOS::Driver::RemoteException.new("Edge #{edge_id} is not connected", "EdgeUnavailable", [] of String, 503) unless runtime_status.connected + response = Protocol.request(Protocol::Message::Execute.new(module_id, payload, user_id), expect: Protocol::Message::ExecuteResponse, preserve_response: true) if response.nil? raise PlaceOS::Driver::RemoteException.new("No response received from edge received", IO::TimeoutError.class.to_s) @@ -108,51 +149,6 @@ module PlaceOS::Core !!Protocol.request(Protocol::Message::Kill.new(ProcessManager.path_to_key(driver_key)), expect: Protocol::Message::Success) end - alias Module = Protocol::Message::RegisterResponse::Module - - # Calculates the modules/drivers that the edge needs to add/remove - # - protected def register(drivers : Set(String), modules : Set(String)) - allocated_drivers = Set(String).new - allocated_modules = Set(Module).new - edge_modules = {} of String => PlaceOS::Model::Module - - PlaceOS::Model::Module.on_edge(edge_id).each do |mod| - driver = mod.driver.not_nil! - mod_id = mod.id.as(String) - edge_modules[mod_id] = mod - driver_path = store.built?(driver.file_name, driver.commit, driver.repository!.branch, driver.repository!.uri) - if driver_path - driver_key = Path[driver_path].basename - allocated_modules << {key: driver_key, module_id: mod_id} - allocated_drivers << driver_key - else - Log.error { {message: "Executable for #{driver.id} not present", driver: driver.id, commit: driver.commit} } - end - end - - add_modules = allocated_modules.reject { |mod| modules.includes?(mod[:module_id]) } - remove_modules = (modules - allocated_modules.map(&.[:module_id])).to_a - - # After registering the modules we need to start them - should_start = [] of Tuple(String, String) - add_modules.each do |module_details| - module_id = module_details[:module_id] - mod = edge_modules[module_id] - next unless mod.running - should_start << {module_id, ModuleManager.start_payload(mod)} - end - - Protocol::Message::RegisterResponse.new( - success: true, - add_drivers: (allocated_drivers - drivers).to_a, - remove_drivers: (drivers - allocated_drivers).to_a, - add_modules: add_modules, - remove_modules: remove_modules, - running_modules: should_start - ) - end - # Callbacks ############################################################################################### @@ -318,6 +314,13 @@ module PlaceOS::Core t.send_request(request) end + def disconnected! + update_runtime_status(connected: false, last_seen: Time.utc, last_event: "disconnected") + if edge = PlaceOS::Model::Edge.find?(edge_id) + edge.update_fields(online: false, last_seen: Time.utc) + end + end + # Utilities ############################################################################################### @@ -328,5 +331,35 @@ module PlaceOS::Core message = match.pre_match unless match.nil? {message, exception} end + + private def update_runtime_status( + connected : Bool? = nil, + last_seen : Time? = nil, + snapshot_version : String? = nil, + pending_updates : Int32? = nil, + pending_events : Int32? = nil, + last_event : String? = nil, + last_error : String? = nil, + ) + current = @runtime_status + @runtime_status = RuntimeStatus.new( + connected: connected.nil? ? current.connected : connected, + last_seen: last_seen || current.last_seen, + snapshot_version: snapshot_version || current.snapshot_version, + pending_updates: pending_updates || current.pending_updates, + pending_events: pending_events || current.pending_events, + last_event: last_event || current.last_event, + last_error: last_error.nil? ? current.last_error : last_error + ) + + if edge = PlaceOS::Model::Edge.find?(edge_id) + edge.update_fields( + online: @runtime_status.connected, + last_seen: @runtime_status.last_seen + ) + end + + true + end end end diff --git a/src/placeos-core/process_manager/local.cr b/src/placeos-core/process_manager/local.cr index 39f757c9..5233ba70 100644 --- a/src/placeos-core/process_manager/local.cr +++ b/src/placeos-core/process_manager/local.cr @@ -11,6 +11,7 @@ module PlaceOS::Core private getter discovery : Clustering::Discovery private getter store : DriverStore + private getter module_manager : ModuleManager # Track active execute requests for lazy modules (module_id => count) private getter lazy_execute_counts : Hash(String, Atomic(Int32)) = {} of String => Atomic(Int32) @@ -19,7 +20,7 @@ module PlaceOS::Core # Track scheduled unload fibers to cancel them if new executions come in private getter lazy_unload_scheduled : Hash(String, Bool) = {} of String => Bool - def initialize(@discovery : Clustering::Discovery) + def initialize(@discovery : Clustering::Discovery, @module_manager : ModuleManager = Services.module_manager) @store = DriverStore.new end @@ -62,14 +63,22 @@ module PlaceOS::Core def execute(module_id : String, payload : String | IO, user_id : String?, mod : Model::Module? = nil) mod = mod || Model::Module.find?(module_id) - raise ModuleError.new("Could not locate module #{module_id}, no matching database record") unless mod + if mod + # Check if this is a lazy module that needs to be loaded + return execute_lazy(mod, payload, user_id) if mod.launch_on_execute + raise ModuleError.new("Could not locate module #{module_id}, it is stopped") unless mod.running + end - # Check if this is a lazy module that needs to be loaded - return execute_lazy(mod, payload, user_id) if mod.launch_on_execute - raise ModuleError.new("Could not locate module #{module_id}, it is stopped") unless mod.running + # The module should already be running and have a management module. + # If the backing ORM row has disappeared, still allow execution against the + # loaded module so in-flight runtimes remain operable. + manager = if existing = protocol_manager_by_module?(module_id) + existing + elsif mod + ensure_lazy_module_loaded(mod) + end - # the module should be running and have a management module - manager = protocol_manager_by_module?(module_id) || ensure_lazy_module_loaded(mod) + raise ModuleError.new("Could not locate module #{module_id}, no matching database record") if manager.nil? request_body = payload.is_a?(IO) ? payload.gets_to_end : payload manager.execute( module_id, @@ -117,8 +126,17 @@ module PlaceOS::Core return manager end - driver = mod.driver! - repository = driver.repository! + driver = mod.driver || mod.driver_id.try { |id| Model::Driver.find?(id) } + if driver.nil? + Log.debug { {message: "deferring lazy module load until driver exists", module_id: module_id, driver_id: mod.driver_id} } + raise ModuleError.new("Driver missing for lazy module #{module_id}") + end + + repository = driver.repository || driver.repository_id.try { |id| Model::Repository.find?(id) } + if repository.nil? + Log.debug { {message: "deferring lazy module load until repository exists", module_id: module_id, driver_id: driver.id} } + raise ModuleError.new("Repository missing for lazy module #{module_id}") + end driver_path = store.built?(driver.file_name, driver.commit, repository.branch, repository.uri) raise ModuleError.new("Driver not compiled for lazy module #{module_id}") if driver_path.nil? @@ -180,6 +198,7 @@ module PlaceOS::Core # Stop and unload the module stop(module_id) unload(module_id) + module_manager.register_lazy_module(mod) Log.info { {message: "unloaded lazy module after idle timeout", module_id: module_id, name: mod.name} } end @@ -234,7 +253,11 @@ module PlaceOS::Core ############################################################################################### def on_system_model(request : Request, response_callback : Request ->) - request.payload = PlaceOS::Model::ControlSystem.find!(request.id).to_json + if control_system = PlaceOS::Model::ControlSystem.find?(request.id) + request.payload = control_system.to_json + else + raise ModuleError.new("Could not locate control system #{request.id}") + end rescue error request.set_error(error) ensure @@ -242,17 +265,16 @@ module PlaceOS::Core end def on_exec(request : Request, response_callback : Request ->) - module_manager = ModuleManager.instance module_id = request.id manager, mod_orm = module_manager.process_manager(module_id) request = if manager.module_loaded?(module_id) - local_execute(request, module_id, mod_orm) + local_execute(manager, request, module_id, mod_orm) else core_uri = which_core(module_id) if core_uri == discovery.uri # If the module maps to this node - local_execute(request, module_id, mod_orm) + local_execute(manager, request, module_id, mod_orm) else # Otherwise, dial core node responsible for the module remote_execute(core_uri, request) @@ -302,8 +324,8 @@ module PlaceOS::Core request end - protected def local_execute(request, module_id, mod_orm) - response = execute(module_id, request.payload.as(String), request.user_id, mod_orm) || {"".as(String?), 500} + protected def local_execute(manager : ProcessManager, request, module_id, mod_orm) + response = manager.execute(module_id, request.payload.as(String), request.user_id) || {"".as(String?), 500} request.code = response[1] request.payload = response[0] request.cmd = :result @@ -332,7 +354,11 @@ module PlaceOS::Core # Used in `on_exec` for locating the remote module # def which_core(module_id : String) : URI - edge_id = Model::Module.find!(module_id).edge_id if Model::Module.has_edge_hint?(module_id) + edge_id = if Model::Module.has_edge_hint?(module_id) + mod = Model::Module.find?(module_id) + raise ModuleError.new("Could not locate module #{module_id}, no matching database record") unless mod + mod.edge_id + end node = edge_id ? discovery.find?(edge_id) : discovery.find?(module_id) raise Error.new("No registered core instances") if node.nil? node diff --git a/src/placeos-core/resource_manager.cr b/src/placeos-core/resource_manager.cr index b5b30f33..cb9ab42c 100644 --- a/src/placeos-core/resource_manager.cr +++ b/src/placeos-core/resource_manager.cr @@ -19,11 +19,25 @@ module PlaceOS::Core getter? started = false private getter start_lock = Mutex.new + @@instance_lock = Mutex.new @@instance : ResourceManager? def self.instance(testing = false) : ResourceManager - (@@instance ||= ResourceManager.new(testing: testing)).as(ResourceManager) + @@instance_lock.synchronize do + (@@instance ||= ResourceManager.new(testing: testing)).as(ResourceManager) + end + end + + def self.current_instance? : ResourceManager? + @@instance + end + + def self.reset_instance + @@instance_lock.synchronize do + @@instance.try &.stop + @@instance = nil + end end def initialize( @@ -68,6 +82,7 @@ module PlaceOS::Core @started = false driver_builder.stop control_system_modules.stop + driver_module_names.stop module_names.stop settings_updates.stop end diff --git a/src/placeos-core/services.cr b/src/placeos-core/services.cr new file mode 100644 index 00000000..9705985c --- /dev/null +++ b/src/placeos-core/services.cr @@ -0,0 +1,51 @@ +module PlaceOS::Core + module Services + extend self + + @@module_manager : ModuleManager? + @@resource_manager : ResourceManager? + @@module_manager_lock = Mutex.new + @@resource_manager_lock = Mutex.new + + def module_manager : ModuleManager + @@module_manager_lock.synchronize do + @@module_manager ||= ModuleManager.instance + end + end + + def resource_manager : ResourceManager + @@resource_manager_lock.synchronize do + @@resource_manager ||= ResourceManager.instance + end + end + + def module_manager=(manager : ModuleManager?) + @@module_manager_lock.synchronize do + @@module_manager = manager + end + end + + def resource_manager=(manager : ResourceManager?) + @@resource_manager_lock.synchronize do + @@resource_manager = manager + end + end + + def current_module_manager? : ModuleManager? + @@module_manager_lock.synchronize { @@module_manager } + end + + def current_resource_manager? : ResourceManager? + @@resource_manager_lock.synchronize { @@resource_manager } + end + + def reset + @@module_manager_lock.synchronize do + @@module_manager = nil + end + @@resource_manager_lock.synchronize do + @@resource_manager = nil + end + end + end +end diff --git a/src/placeos-core/settings_update.cr b/src/placeos-core/settings_update.cr index cf0f8d1a..9069b7a0 100644 --- a/src/placeos-core/settings_update.cr +++ b/src/placeos-core/settings_update.cr @@ -5,7 +5,7 @@ module PlaceOS class Core::SettingsUpdate < Resource(Model::Settings) private getter module_manager : ModuleManager - def initialize(@module_manager : ModuleManager = ModuleManager.instance) + def initialize(@module_manager : ModuleManager = Services.module_manager) super() end @@ -30,7 +30,7 @@ module PlaceOS result = Result::Success # Find each module affected by the Settings change - settings.dependent_modules.each do |mod| + dependent_modules(settings).each do |mod| begin if module_manager.refresh_module(mod) Log.info { {message: "#{mod.running_was == false ? "started" : "updated"} module with new settings", module_id: mod.id, settings_id: settings.id} } @@ -43,5 +43,29 @@ module PlaceOS result end + + private def self.dependent_modules(settings : Model::Settings) : Array(Model::Module) + model_id = settings.parent_id + model_type = settings.parent_type + return [] of Model::Module if model_id.nil? || model_type.nil? + + case model_type + in .module? + mod = Model::Module.find?(model_id) + mod ? [mod] : [] of Model::Module + in .driver? + Model::Module.by_driver_id(model_id).to_a + in .control_system? + Model::Module + .in_control_system(model_id) + .select(&.role.logic?) + .to_a + in .zone? + Model::Module + .in_zone(model_id) + .select(&.role.logic?) + .to_a + end + end end end diff --git a/src/placeos-edge/binary_manager.cr b/src/placeos-edge/binary_manager.cr new file mode 100644 index 00000000..db4997e6 --- /dev/null +++ b/src/placeos-edge/binary_manager.cr @@ -0,0 +1,102 @@ +require "http" +require "uri" +require "simple_retry" + +require "../placeos-core/driver_manager" + +module PlaceOS::Edge + class BinaryManager + Log = ::Log.for(self) + + DOWNLOAD_PATH = "/api/core/v1/edge/%{edge_id}/drivers/%{driver_key}" + MIN_FREE_SPACE = 100_000_000_i64 # 100MB minimum free space + + protected getter store : PlaceOS::Core::DriverStore + private getter edge_id : String + private getter base_uri : URI + private getter secret : String + + def initialize(@edge_id : String, @base_uri : URI, @secret : String, @store : PlaceOS::Core::DriverStore = PlaceOS::Core::DriverStore.new) + end + + def ensure_binary(driver_key : String, max_retries : Int32 = 3) + path = store.path(driver_key).to_s + return path if File.exists?(path) && valid_binary?(path) + + # Check available disk space before download + check_disk_space(path) + + temp_path = "#{path}.download" + + begin + SimpleRetry.try_to( + max_attempts: max_retries, + base_interval: 1.second, + max_interval: 5.seconds + ) do |attempt, error| + if error + Log.warn { "driver download retry #{attempt}/#{max_retries} for #{driver_key}: #{error.message}" } + File.delete(temp_path) rescue nil + end + + uri = base_uri.dup + uri.path = DOWNLOAD_PATH % {edge_id: edge_id, driver_key: URI.encode_path(driver_key)} + uri.query = URI::Params.encode({"api-key" => secret}) + + HTTP::Client.get(uri) do |response| + raise "failed to download driver #{driver_key}: #{response.status_code}" unless response.success? + + File.open(temp_path, mode: "w+", perm: File::Permissions.new(0o744)) do |file| + IO.copy(response.body_io, file) + end + end + + # Validate the downloaded binary + raise "invalid or corrupted binary for #{driver_key}" unless valid_binary?(temp_path) + + # Atomic move to final location + File.rename(temp_path, path) + end + rescue error + File.delete(temp_path) rescue nil + raise error + end + + path + end + + def delete_binary(driver_key : String) + path = store.path(driver_key).to_s + File.delete(path) if File.exists?(path) + rescue error + Log.error(exception: error) { "failed to delete binary #{driver_key}" } + end + + def compiled_drivers + store.compiled_drivers.to_set + end + + private def valid_binary?(path : String) : Bool + return false unless File.exists?(path) + return false unless File.size(path) > 0 + return false unless File.executable?(path) + true + rescue + false + end + + private def check_disk_space(path : String) + dir = File.dirname(path) + Dir.mkdir_p(dir) unless Dir.exists?(dir) + + {% if flag?(:linux) || flag?(:darwin) %} + stat = File.info(dir) + if stat.responds_to?(:free_space) && stat.free_space < MIN_FREE_SPACE + raise "insufficient disk space: #{stat.free_space} bytes available, need at least #{MIN_FREE_SPACE}" + end + {% end %} + rescue ex : File::Error + Log.warn(exception: ex) { "unable to check disk space for #{dir}" } + end + end +end diff --git a/src/placeos-edge/client.cr b/src/placeos-edge/client.cr index 84a02be4..770916e7 100644 --- a/src/placeos-edge/client.cr +++ b/src/placeos-edge/client.cr @@ -1,56 +1,63 @@ require "simple_retry" -require "rwlock" require "uri" require "placeos-driver/protocol/management" -require "../placeos-core/process_manager/common" require "../placeos-core/driver_manager" +require "./binary_manager" require "./constants" +require "./desired_state_client" require "./protocol" -require "./transport" +require "./realtime_channel" +require "./reconciler" +require "./runtime_manager" +require "./runtime_store" +require "./state" module PlaceOS::Edge class Client - include Core::ProcessManager::Common - Log = ::Log.for(self) - WEBSOCKET_API_PATH = "/api/engine/v2/edges/control" + WEBSOCKET_API_PATH = "/api/core/v1/edge/control" - protected getter store : Core::DriverStore + protected getter store : PlaceOS::Core::DriverStore + getter runtime_store : RuntimeStore + getter runtime_manager : RuntimeManager + @binary_manager : BinaryManager? = nil + @desired_state : DesiredStateClient? = nil + @reconciler : Reconciler? = nil private getter secret : String - + private getter edge_id : String + private getter poll_interval : Time::Span private getter! uri : URI - protected getter! transport : Transport + @realtime : RealtimeChannel? = nil - # NOTE: For testing purposes + # NOTE: injected socket controls are only for deterministic spec coverage of + # realtime transport behavior; production flow uses the normal websocket path. private getter? skip_handshake : Bool private getter? ping : Bool + private getter? sync_injected_socket : Bool private getter close_channel = Channel(Nil).new - - # structures for tracking what has been loaded and what has been requested - # this allows us do some of these things out of order when they become available - @loading_mutex = Mutex.new(:reentrant) - # driver_key => downloaded signal - @loading_driver_keys = {} of String => Channel(Nil) - # driver_key => [mod_ids] - @loading_modules = Hash(String, Array(String)).new { |hash, key| hash[key] = [] of String } - # module_id => payload - @pending_start = {} of String => String - - getter host : String { uri.to_s.gsub(uri.request_target, "") } + @connect_sync_count = Atomic(Int32).new(0) + @injected_socket_mode : Bool = false def initialize( uri : URI = PLACE_URI, secret : String? = nil, - @sequence_id : UInt64 = 0, + @edge_id : String = EDGE_ID, @skip_handshake : Bool = false, @ping : Bool = true, - @store = Core::DriverStore.new, + @sync_injected_socket : Bool = false, + @store : PlaceOS::Core::DriverStore = PlaceOS::Core::DriverStore.new, + @runtime_store : RuntimeStore = RuntimeStore.new, + @poll_interval : Time::Span = SNAPSHOT_POLL_INTERVAL, ) + # Validate configuration + raise ArgumentError.new("edge_id cannot be empty") if @edge_id.empty? + raise ArgumentError.new("poll_interval must be positive") if @poll_interval <= Time::Span.zero + @secret = if secret && secret.presence secret else @@ -58,80 +65,55 @@ module PlaceOS::Edge CLIENT_SECRET end - # Mutate a copy as secret is embedded in uri - uri = uri.dup - uri.path = WEBSOCKET_API_PATH - uri.query = "api-key=#{@secret}" - @uri = uri - end + raise ArgumentError.new("secret cannot be empty") if @secret.empty? - alias ModuleError = ::PlaceOS::Core::ModuleError + @uri = uri.dup - # Implement the abstract method from Common - def execute(module_id : String, payload : String | IO, user_id : String?, mod : Model::Module? = nil) - manager = protocol_manager_by_module?(module_id) - - raise ModuleError.new("No protocol manager for #{module_id}") if manager.nil? + @runtime_manager = RuntimeManager.new( + store: store, + on_setting_callback: ->on_setting(String, String, String), + on_redis_callback: ->on_redis(Protocol::RedisAction, String, String, String?) + ) - request_body = payload.is_a?(IO) ? payload.gets_to_end : payload - manager.execute( - module_id, - request_body, - user_id: user_id, + @binary_manager = binary_manager = BinaryManager.new(edge_id, self.uri, @secret, @store) + @desired_state = DesiredStateClient.new(edge_id, self.uri, @secret) + @reconciler = Reconciler.new( + runtime_store: @runtime_store, + binary_manager: binary_manager, + runtime_manager: @runtime_manager, + on_event: ->send_runtime_event(State::RuntimeEvent) ) - rescue error : PlaceOS::Driver::RemoteException - raise error - rescue exception - raise module_error(module_id, exception) end - # Initialize the WebSocket API - # - # Optionally accepts a block called after connection has been established. + # Initialize the WebSocket API and desired-state polling loop. def connect(initial_socket : HTTP::WebSocket? = nil, &) - Log.info { "connecting to #{host}" } - - @transport = Transport.new( - on_disconnect: ->(_error : HTTP::WebSocket::CloseCode | IO::Error) { - Log.debug { "core connection lost. Cleaning up pending operations" } - - @loading_mutex.synchronize do - @loading_driver_keys.each { |_driver_key, channel| channel.close } - @loading_driver_keys = {} of String => Channel(Nil) - @loading_modules = Hash(String, Array(String)).new { |hash, key| hash[key] = [] of String } - @pending_start = {} of String => String - end - nil - }, - on_connect: -> { - handshake unless skip_handshake? - nil - } - ) do |(sequence_id, request)| - if request.is_a?(Protocol::Server::Request) - handle_request(sequence_id, request) + Log.info { "connecting to #{uri}" } + @injected_socket_mode = !initial_socket.nil? + + channel = RealtimeChannel.new(uri, secret, edge_id, ping? || false) + @realtime = channel + channel.connect( + initial_socket, + on_disconnect: ->on_disconnect(IO::Error | HTTP::WebSocket::CloseCode), + on_connect: ->on_connect + ) do |message| + if request = message[1].as?(Protocol::Server::Request) + handle_request(message[0], request) else - Log.error { {message: "unexpected core request", request: request.to_json} } + Log.error { {message: "unexpected core request", request: message[1].to_json} } end end - spawn { transport.connect(uri, initial_socket) } + spawn { desired_state_loop } unless skip_handshake? || injected_socket_mode? - while transport.closed? - sleep 10.milliseconds - Fiber.yield - end - - # Send ping frames - spawn { transport.ping if ping? } + load_persisted_snapshot unless skip_handshake? || injected_socket_mode? yield close_channel.receive? - transport.disconnect + realtime?.try &.disconnect end - # :ditto: def connect(initial_socket : HTTP::WebSocket? = nil) connect(initial_socket) { } end @@ -145,85 +127,44 @@ module PlaceOS::Edge case request in Protocol::Message::Debug - boolean_command(sequence_id, request) do - debug(request.module_id) - end + boolean_command(sequence_id, request) { debug(request.module_id) } in Protocol::Message::DriverLoaded - boolean_command(sequence_id, request) do - driver_loaded?(request.driver_key) - end + boolean_command(sequence_id, request) { runtime_manager.driver_loaded?(request.driver_key) } in Protocol::Message::DriverStatus - status = driver_status(request.driver_key) - send_response(sequence_id, Protocol::Message::DriverStatusResponse.new(status)) + send_response(sequence_id, Protocol::Message::DriverStatusResponse.new(runtime_manager.driver_status(request.driver_key))) in Protocol::Message::Execute success, output, response_code = begin - result = execute( - request.module_id, - request.payload, - user_id: request.user_id, - ) - - ({true, result[0], result[1]}) + result = runtime_manager.execute(request.module_id, request.payload, user_id: request.user_id) + {true, result[0], result[1]} rescue error : PlaceOS::Driver::RemoteException - Log.error(exception: error) { { - module_id: request.module_id, - message: "execute errored", - } } - ({false, {message: error.message, backtrace: error.backtrace?, code: error.code}.to_json, error.code}) + Log.error(exception: error) { {module_id: request.module_id, message: "execute errored"} } + {false, {message: error.message, backtrace: error.backtrace?, code: error.code}.to_json, error.code} end send_response(sequence_id, Protocol::Message::ExecuteResponse.new(success, output, response_code)) in Protocol::Message::Ignore - boolean_command(sequence_id, request) do - ignore(request.module_id) - end + boolean_command(sequence_id, request) { ignore(request.module_id) } in Protocol::Message::Kill - boolean_command(sequence_id, request) do - kill(request.driver_key) - end + boolean_command(sequence_id, request) { runtime_manager.kill(request.driver_key) } in Protocol::Message::Load - boolean_command(sequence_id, request) do - # @loading_mutex.synchronize do - # File.delete(path(request.driver_key)) if !protocol_manager_by_driver?(request.driver_key) && File.exists?(path(request.driver_key)) - # end - load(request.module_id, request.driver_key) - end + boolean_command(sequence_id, request) { runtime_manager.load(request.module_id, request.driver_key) } in Protocol::Message::LoadedModules - send_response(sequence_id, Protocol::Message::LoadedModulesResponse.new(loaded_modules)) + send_response(sequence_id, Protocol::Message::LoadedModulesResponse.new(runtime_manager.loaded_modules)) in Protocol::Message::ModuleLoaded - boolean_command(sequence_id, request) do - module_loaded?(request.module_id) - end + boolean_command(sequence_id, request) { runtime_manager.module_loaded?(request.module_id) } in Protocol::Message::RunCount - send_response(sequence_id, run_count_message) + send_response(sequence_id, Protocol::Message::RunCountResponse.new(count: runtime_manager.run_count)) in Protocol::Message::Start - boolean_command(sequence_id, request) do - queue_start(request.module_id, request.payload) - end + boolean_command(sequence_id, request) { runtime_manager.start(request.module_id, request.payload) } in Protocol::Message::Stop - boolean_command(sequence_id, request) do - @loading_mutex.synchronize do - @pending_start.delete(request.module_id) - stop(request.module_id) - end - end + boolean_command(sequence_id, request) { runtime_manager.stop(request.module_id) } in Protocol::Message::SystemStatus - send_response(sequence_id, Protocol::Message::SystemStatusResponse.new(system_status)) + send_response(sequence_id, Protocol::Message::SystemStatusResponse.new(runtime_manager.system_status)) in Protocol::Message::Unload boolean_command(sequence_id, request) do - @loading_mutex.synchronize do - @pending_start.delete(request.module_id) - if driver_key = driver_key_for?(request.module_id) - if modules = @loading_modules[driver_key]? - modules.delete(request.module_id) - if modules.empty? && (channel = @loading_driver_keys.delete(driver_key)) - # abort downloading of driver - channel.close - end - end - end - unload(request.module_id) - end + runtime_manager.unload(request.module_id) + runtime_store.delete_runtime_module(request.module_id) + true end in Protocol::Message::Body Log.warn { {message: "unexpected message in handle request", type: request.type.to_s} } @@ -232,252 +173,40 @@ module PlaceOS::Edge Log.error(exception: e) { {message: "failed to handle core request", request: request.to_json} } end - def handshake - SimpleRetry.try_to(base_interval: 500.milliseconds, max_interval: 5.seconds) do - begin - response = Protocol.request(registration_message, expect: Protocol::Message::RegisterResponse) - unless response - Log.warn { "failed to register to core" } - raise "handshake failed" - end - - response.remove_modules.each do |mod| - unload(mod) - end - - response.remove_drivers.each do |driver| - remove_binary(driver) - end - - load_binaries(response.add_drivers) - - response.add_modules.each do |mod| - load(mod[:module_id], mod[:key]) - end - - response.running_modules.each do |(module_id, payload)| - queue_start(module_id, payload) - end - - Log.info { "handshake success, edge registered" } - rescue error - Log.error(exception: error) { "during handshake" } - raise error - end - end - end - - def queue_start(module_id : String, payload : String) - @loading_mutex.synchronize do - if protocol_manager_by_module?(module_id) - start(module_id, payload) - else - @pending_start[module_id] = payload - end - end - end - - # Kicks off downloading all the binaries - def load_binaries(binaries : Array(String)) - promises = binaries.map do |driver_key| - File.delete(path(driver_key)) if File.exists?(path(driver_key)) - Promise.defer do - if wait_load = load_binary(driver_key) - select - when wait_load.receive? - when timeout(90.seconds) - Log.error { "timeout loading #{driver_key}" } - end - end - end - end - - Promise.all(promises).get - end - - # Message - ########################################################################### - - # Extracts the running modules and drivers on the edge - # - protected def registration_message : Protocol::Message::Register - Protocol::Message::Register.new( - modules: modules, - drivers: drivers, - ) - end - - protected def run_count_message : Protocol::Message::RunCountResponse - Protocol::Message::RunCountResponse.new(count: run_count) - end - - # Driver binaries - ########################################################################### - - # List the driver binaries present on this client - # def drivers - store.compiled_drivers.to_set + binary_manager.compiled_drivers end - # Load binary, first checking if present locally then fetch from core - # - def load_binary(key : String) : Channel(Nil)? - perform_load = true - loaded_channel = Channel(Nil).new - - @loading_mutex.synchronize do - Log.debug { {key: key, message: "loading binary"} } - - if loading = @loading_driver_keys[key]? - perform_load = false - loaded_channel = loading - else - return if File.exists?(path(key)) - @loading_driver_keys[key] = loaded_channel - end - end - - return loaded_channel unless perform_load - spawn { attempt_download(loaded_channel, key) } - - loaded_channel + def driver_loaded?(driver_key : String) : Bool + runtime_manager.driver_loaded?(driver_key) end - def attempt_download(loaded_channel, key) - binary = SimpleRetry.try_to(base_interval: 5.seconds, max_interval: 30.seconds) do - result = fetch_binary(key) unless loaded_channel.closed? - raise "retry" if result.nil? && !loaded_channel.closed? && @loading_driver_keys[key]? == loaded_channel - result - end - - @loading_mutex.synchronize do - if !loaded_channel.closed? - # write the executable - if binary - add_binary(key, binary) - end - - # signal that we're ready to run - loaded_channel.close - @loading_driver_keys.delete(key) - - # load any requests that have come in the mean time - if pending = @loading_modules.delete(key) - pending.each do |module_id| - load(module_id, key) - if payload = @pending_start.delete(module_id) - start(module_id, payload) - end - end - end - end - end - rescue error - Log.error(exception: error) { "error during download attempt" } - spawn { attempt_download(loaded_channel, key) } unless loaded_channel.closed? + def module_loaded?(module_id : String) : Bool + runtime_manager.module_loaded?(module_id) end - def fetch_binary(key : String) : IO? - response = Protocol.request(Protocol::Message::FetchBinary.new(key), expect: Protocol::Message::BinaryBody) - response.try &.io + def driver_status(driver_key : String) + runtime_manager.driver_status(driver_key) end - def add_binary(key : String, binary : IO) - path = path(key) - File.delete(path) if File.exists?(path) - Log.debug { {path: path, message: "writing binary"} } - - # Default permissions + execute for owner - File.open(path, mode: "w+", perm: File::Permissions.new(0o744)) do |file| - IO.copy(binary, file) - end + def loaded_modules + runtime_manager.loaded_modules end - def remove_binary(key : String) - @loading_mutex.synchronize do - # clean up any pending operations - if loading = @loading_driver_keys.delete(key) - loading.close - end - if pending = @loading_modules.delete(key) - pending.each { |module_id| @pending_start.delete(module_id) } - end - File.delete(path(key)) - end - true - rescue - false + def run_count + runtime_manager.run_count end - protected def path(key : String) - store.path(key).to_s + def apply_snapshot(snapshot : State::Snapshot) + reconciler.apply(snapshot) end - # Modules - ########################################################################### - - # Check for binary, request if it's not present - # Start the module with redis hooks - def load(module_id, driver_key) - Log.context.set(module_id: module_id, driver_key: driver_key) - - if !protocol_manager_by_module?(module_id) - if existing_driver_manager = protocol_manager_by_driver?(driver_key) - # Use the existing driver protocol manager - set_module_protocol_manager(module_id, existing_driver_manager) - else - if wait_load = load_binary(driver_key) - select - when wait_load.receive? - @loading_mutex.synchronize do - unless File.exists?(path(driver_key)) - Log.info { "module load aborted" } - return - end - end - when timeout(20.seconds) - @loading_mutex.synchronize do - # ensure we are still loading this - if @loading_driver_keys[driver_key]? - @loading_modules[driver_key] << module_id - Log.info { "queuing module load" } - return - end - end - end - end - - # Create a new protocol manager - manager = Driver::Protocol::Management.new(path(driver_key), on_edge: true) - - # Callbacks - manager.on_setting = ->(id : String, setting_name : String, setting_value : YAML::Any) { - Log.debug { {module_id: module_id, driver_key: driver_key, message: "on_setting"} } - on_setting(id, setting_name, setting_value.to_yaml) - } - - manager.on_redis = ->(action : Protocol::RedisAction, hash_id : String, key_name : String, status_value : String?) { - Log.debug { {module_id: module_id, driver_key: driver_key, action: action.to_s, message: "on_redis"} } - on_redis(action, hash_id, key_name, status_value) - } - - set_module_protocol_manager(module_id, manager) - set_driver_protocol_manager(driver_key, manager) - end - - Log.info { "module loaded" } - else - Log.info { "module already loaded" } - end + def protocol_manager_by_driver?(driver_key : String) + runtime_manager.protocol_manager_by_driver?(driver_key) end - # List the modules running on this client - # - def modules - protocol_manager_lock.synchronize do - @module_protocol_managers.keys.to_set - end + def protocol_manager_by_module?(module_id : String) + runtime_manager.protocol_manager_by_module?(module_id) end # Debugging @@ -491,7 +220,7 @@ module PlaceOS::Edge unless debug_callbacks.has_key?(module_id) callback = ->(message : String) { forward_debug_message(module_id, message); nil } debug_callbacks[module_id] = callback - protocol_manager_by_module?(module_id).try &.debug(module_id, &callback) + runtime_manager.debug(module_id, &callback) end end end @@ -499,18 +228,21 @@ module PlaceOS::Edge def ignore(module_id : String) debug_lock.synchronize do callback = debug_callbacks.delete(module_id) - protocol_manager_by_module?(module_id).try &.ignore(module_id, &callback) unless callback.nil? + runtime_manager.ignore(module_id, &callback) unless callback.nil? end end def forward_debug_message(module_id, message) - send_request(Protocol::Message::DebugMessage.new(module_id, message)) + spawn do + send_event(Protocol::Message::DebugMessage.new(module_id, message)) + rescue error + Log.error(exception: error) { {message: "forward_debug_message errored", module_id: module_id} } + end end - # Module Callbacks + # Edge-originated sync ########################################################################### - # Proxy a settings write via Core def on_setting(module_id : String, setting_name : String, setting_value : String) request = Protocol::Message::SettingsAction.new( module_id: module_id, @@ -521,23 +253,127 @@ module PlaceOS::Edge Protocol.request(request, expect: Protocol::Message::Success) end - # Proxy a redis action via Core def on_redis(action : Protocol::RedisAction, hash_id : String, key_name : String, status_value : String?) + update = runtime_store.queue_update(action, hash_id, key_name, status_value) + flush_update(update) if connected? + end + + private def flush_pending_updates + runtime_store.pending_updates.each do |update| + flush_update(update) + end + end + + private def flush_pending_events + runtime_store.pending_events.each do |pending| + flush_event(pending) + end + end + + private def flush_update(update : State::PendingRedisUpdate) + return unless connected? + return if injected_socket_mode? && !sync_injected_socket? + request = Protocol::Message::ProxyRedis.new( - action: action, - hash_id: hash_id, - key_name: key_name, - status_value: status_value, + action: update.action, + hash_id: update.hash_id, + key_name: update.key_name, + status_value: update.status_value, + ) + + response = Protocol.request(request, expect: Protocol::Message::Success) + runtime_store.acknowledge_update(update.id) if response.try(&.success) + end + + private def send_runtime_event(event : State::RuntimeEvent) + pending = runtime_store.queue_event(event) + flush_event(pending) if connected? + end + + private def flush_event(pending : State::PendingRuntimeEvent) + return unless connected? + return if injected_socket_mode? && !sync_injected_socket? + + event = pending.event + + request = Protocol::Message::RuntimeEvent.new( + kind: event.kind.to_s.underscore, + module_id: event.module_id, + driver_key: event.driver_key, + message: event.message, + snapshot_version: event.snapshot_version, + backlog_depth: event.backlog_depth + ) + + response = Protocol.request(request, expect: Protocol::Message::Success) + runtime_store.acknowledge_event(pending.id) if response.try(&.success) + end + + private def send_heartbeat + return unless connected? + return if injected_socket_mode? && !sync_injected_socket? + + request = Protocol::Message::Heartbeat.new( + timestamp: Time.utc, + snapshot_version: runtime_store.last_snapshot_version, + pending_updates: runtime_store.pending_update_count, + pending_events: runtime_store.pending_event_count ) Protocol.request(request, expect: Protocol::Message::Success) end + private def on_connect + flush_pending_updates + flush_pending_events + send_heartbeat + @connect_sync_count.add(1) + nil + end + + private def on_disconnect(_error : IO::Error | HTTP::WebSocket::CloseCode) + nil + end + + private def connected? + realtime?.try(&.closed?) == false + end + + # Desired state reconciliation + ########################################################################### + + private def desired_state_loop + last_modified = runtime_store.snapshot.try(&.last_modified) + + until close_channel.closed? + begin + if snapshot = desired_state.fetch(last_modified) + # Don't hold lock during reconciliation - it can take minutes + reconciler.apply(snapshot) + last_modified = snapshot.last_modified + end + rescue error + runtime_store.set_last_error(error.message) + send_runtime_event(State::RuntimeEvent.new(:sync_status, message: error.message, snapshot_version: runtime_store.last_snapshot_version, backlog_depth: runtime_store.pending_update_count)) + ensure + send_heartbeat if connected? + end + + sleep poll_interval + end + end + + private def load_persisted_snapshot + if snapshot = runtime_store.snapshot + reconciler.apply(snapshot) + end + rescue error + runtime_store.set_last_error(error.message) + end + # Transport ########################################################################### - # Bundles up the result of a command into a `Success` response - # protected def boolean_command(sequence_id, request, &) success = begin result = yield @@ -551,15 +387,41 @@ module PlaceOS::Edge end protected def send_response(sequence_id : UInt64, response : Protocol::Client::Response | Protocol::Message::Success) - t = transport? - raise "cannot send response over closed transport" if t.nil? - t.send_response(sequence_id, response) + channel = realtime? + raise "cannot send response over closed transport" if channel.nil? + channel.send_response(sequence_id, response) end protected def send_request(request : Protocol::Client::Request) - t = transport? - raise "cannot send request over closed transport" if t.nil? - t.send_request(request) + channel = realtime? + raise "cannot send request over closed transport" if channel.nil? + channel.send_request(request) + end + + protected def send_event(request : Protocol::Client::Request) + channel = realtime? + raise "cannot send event over closed transport" if channel.nil? + channel.send_event(request) + end + + private def realtime? + @realtime + end + + private def injected_socket_mode? + @injected_socket_mode + end + + private def binary_manager + @binary_manager.not_nil! + end + + private def desired_state + @desired_state.not_nil! + end + + private def reconciler + @reconciler.not_nil! end end end diff --git a/src/placeos-edge/constants.cr b/src/placeos-edge/constants.cr index e6ddaab3..0d77327b 100644 --- a/src/placeos-edge/constants.cr +++ b/src/placeos-edge/constants.cr @@ -6,9 +6,15 @@ module PlaceOS::Edge # Secret used to register with PlaceOS CLIENT_SECRET = ENV["PLACE_EDGE_KEY"]? || (production? ? abort("missing PLACE_EDGE_KEY in environment") : "edge-1000_secret") + EDGE_ID = ENV["PLACE_EDGE_ID"]? || "placeos-edge" # URI of PlaceOS instance - PLACE_URI = URI.parse(ENV["PLACE_URI"]? || "https://localhost:8443".tap { |v| Log.warn { "missing PLACE_URI in environment, using #{v}" } }) + PLACE_URI = URI.parse(ENV["PLACE_URI"]? || "https://localhost:8443".tap { |v| Log.warn { "missing PLACE_URI in environment, using #{v}" } }) + SNAPSHOT_POLL_INTERVAL = (ENV["PLACE_EDGE_POLL_INTERVAL"]?.try(&.to_i?) || 5).seconds + + # Backpressure limits for offline operation + MAX_PENDING_UPDATES = (ENV["PLACE_EDGE_MAX_PENDING_UPDATES"]?.try(&.to_i?) || 10_000) + MAX_PENDING_EVENTS = (ENV["PLACE_EDGE_MAX_PENDING_EVENTS"]?.try(&.to_i?) || 1_000) PROD = ENV["SG_ENV"]? == "production" diff --git a/src/placeos-edge/desired_state_client.cr b/src/placeos-edge/desired_state_client.cr new file mode 100644 index 00000000..4538b0ac --- /dev/null +++ b/src/placeos-edge/desired_state_client.cr @@ -0,0 +1,32 @@ +require "http" +require "uri" + +require "./state" + +module PlaceOS::Edge + class DesiredStateClient + SNAPSHOT_PATH = "/api/core/v1/edge/%{edge_id}/desired_state" + + private getter edge_id : String + private getter base_uri : URI + private getter secret : String + + def initialize(@edge_id : String, @base_uri : URI, @secret : String) + end + + def fetch(last_modified : Time? = nil) : State::Snapshot? + uri = base_uri.dup + uri.path = SNAPSHOT_PATH % {edge_id: edge_id} + uri.query = URI::Params.encode({"api-key" => secret}) + + headers = HTTP::Headers.new + headers["If-Modified-Since"] = HTTP.format_time(last_modified) if last_modified + + response = HTTP::Client.get(uri, headers: headers) + return nil if response.status_code == 304 + raise "failed to fetch desired state: #{response.status_code}" unless response.success? + + State::Snapshot.from_json(response.body) + end + end +end diff --git a/src/placeos-edge/protocol.cr b/src/placeos-edge/protocol.cr index 2a8434e1..4737923c 100644 --- a/src/placeos-edge/protocol.cr +++ b/src/placeos-edge/protocol.cr @@ -121,17 +121,15 @@ module PlaceOS::Edge::Protocol Unload # Success # -> Client - Register ProxyRedis # Success FetchBinary SettingsAction # Success + RuntimeEvent # Success + Heartbeat # Success # Response Success - # -> Server - RegisterResponse - # -> Client DebugMessage DriverStatusResponse @@ -299,14 +297,6 @@ module PlaceOS::Edge::Protocol end end - struct Register < Client::Request - getter modules : Set(String) - getter drivers : Set(String) - - def initialize(@modules, @drivers) - end - end - struct SettingsAction < Client::Request getter module_id : String getter setting_name : String @@ -316,6 +306,29 @@ module PlaceOS::Edge::Protocol end end + struct RuntimeEvent < Client::Request + getter kind : String + getter module_id : String? + getter driver_key : String? + getter message : String? + getter snapshot_version : String? + getter backlog_depth : Int32? + + def initialize(@kind, @module_id = nil, @driver_key = nil, @message = nil, @snapshot_version = nil, @backlog_depth = nil) + end + end + + struct Heartbeat < Client::Request + @[JSON::Field(converter: Time::EpochConverter)] + getter timestamp : Time + getter snapshot_version : String? + getter pending_updates : Int32 + getter pending_events : Int32 + + def initialize(@timestamp = Time.utc, @snapshot_version = nil, @pending_updates = 0, @pending_events = 0) + end + end + # Responses ############################################################################################### @@ -333,7 +346,7 @@ module PlaceOS::Edge::Protocol abstract struct ::PlaceOS::Edge::Protocol::Server::Response < ::PlaceOS::Edge::Protocol::Message::ResponseBody end - struct Success < ResponseBody + struct Success < Client::Response def initialize(@success) end end @@ -391,26 +404,6 @@ module PlaceOS::Edge::Protocol def initialize(@success, @key, @path = nil, @io = nil) end end - - struct RegisterResponse < Server::Response - getter add_drivers : Array(String) - getter remove_drivers : Array(String) - getter add_modules : Array(Module) - getter remove_modules : Array(String) - getter running_modules : Array(Tuple(String, String)) - - alias Module = NamedTuple(key: String, module_id: String) - - def initialize( - @success, - @add_drivers = [] of String, - @remove_drivers = [] of String, - @add_modules = [] of Module, - @remove_modules = [] of String, - @running_modules = [] of Tuple(String, String), - ) - end - end end alias RedisAction = ::PlaceOS::Driver::Protocol::Management::RedisAction diff --git a/src/placeos-edge/realtime_channel.cr b/src/placeos-edge/realtime_channel.cr new file mode 100644 index 00000000..abe799f0 --- /dev/null +++ b/src/placeos-edge/realtime_channel.cr @@ -0,0 +1,87 @@ +require "./protocol" +require "./transport" + +module PlaceOS::Edge + class RealtimeChannel + Log = ::Log.for(self) + + private getter uri : URI + private getter secret : String + private getter edge_id : String + private getter ping_enabled : Bool + private getter! transport : Transport + @disconnecting = Atomic(Bool).new(false) + + def initialize(@uri : URI, @secret : String, @edge_id : String, @ping_enabled : Bool = true) + end + + def connect( + initial_socket : HTTP::WebSocket? = nil, + on_disconnect : (IO::Error | HTTP::WebSocket::CloseCode ->)? = nil, + on_connect : Proc(Nil)? = nil, + &on_request : {UInt64, Protocol::Request} -> + ) + @disconnecting.set(false) + socket_uri = uri.dup + socket_uri.path = Client::WEBSOCKET_API_PATH + socket_uri.query = URI::Params.encode({"api-key" => secret, "edge_id" => edge_id}) + + @transport = Transport.new( + on_disconnect: on_disconnect, + on_connect: on_connect, + ) do |message| + on_request.call(message) + end + + spawn do + begin + transport.connect(socket_uri, initial_socket) + rescue IO::Error | Channel::ClosedError + nil + rescue error + Log.error(exception: error) { "realtime channel connect failed" } unless transport.closed? || @disconnecting.get + end + end + + while transport.closed? + sleep 10.milliseconds + Fiber.yield + end + + if ping_enabled + spawn do + begin + transport.ping + rescue IO::Error | Channel::ClosedError + nil + rescue error + Log.error(exception: error) { "realtime channel ping failed" } unless transport.closed? || @disconnecting.get + end + end + end + end + + def disconnect + @disconnecting.set(true) + transport.disconnect unless transport.closed? + rescue + nil + end + + def closed? + transport?.try(&.closed?) != false + end + + def send_request(request : Protocol::Client::Request) + transport.send_request(request) + end + + def send_event(request : Protocol::Client::Request) + transport.send_event(request) + end + + def send_response(sequence_id : UInt64, response : Protocol::Client::Response | Protocol::Message::Success) + transport.send_response(sequence_id, response) + end + end +end diff --git a/src/placeos-edge/reconciler.cr b/src/placeos-edge/reconciler.cr new file mode 100644 index 00000000..ca97951e --- /dev/null +++ b/src/placeos-edge/reconciler.cr @@ -0,0 +1,145 @@ +require "./state" +require "./runtime_store" +require "./binary_manager" +require "./runtime_manager" + +module PlaceOS::Edge + class Reconciler + Log = ::Log.for(self) + + private getter runtime_store : RuntimeStore + private getter binary_manager : BinaryManager + private getter runtime_manager : RuntimeManager + private getter on_event : (State::RuntimeEvent ->)? + + def initialize( + @runtime_store : RuntimeStore, + @binary_manager : BinaryManager, + @runtime_manager : RuntimeManager, + @on_event : (State::RuntimeEvent ->)? = nil, + ) + end + + def apply(snapshot : State::Snapshot) + desired_modules = snapshot.modules.index_by(&.module_id) + current_modules = runtime_store.runtime_modules + + desired_driver_keys = snapshot.drivers.map(&.key).to_set + snapshot.modules.each do |mod| + desired_driver_keys << mod.driver_key + end + + failed_drivers = [] of String + failed_modules = [] of String + + # Download drivers with individual error handling for partial success + desired_driver_keys.each do |driver_key| + begin + binary_manager.ensure_binary(driver_key) + emit(State::RuntimeEvent.new(:driver_ready, driver_key: driver_key)) + rescue error + failed_drivers << driver_key + Log.error(exception: error) { "failed to download driver #{driver_key}" } + emit(State::RuntimeEvent.new(:module_failed, driver_key: driver_key, message: "driver download failed: #{error.message}")) + end + end + + # Unload modules that are no longer desired + current_modules.each_key do |module_id| + next if desired_modules.has_key?(module_id) + begin + unload_module(module_id, current_modules[module_id]) + rescue error + Log.error(exception: error) { "failed to unload module #{module_id}" } + end + end + + # Reconcile modules individually, skipping those with failed drivers + snapshot.modules.each do |desired| + if failed_drivers.includes?(desired.driver_key) + failed_modules << desired.module_id + Log.warn { "skipping module #{desired.module_id} due to failed driver #{desired.driver_key}" } + next + end + + begin + reconcile_module(desired, current_modules[desired.module_id]?) + rescue error + failed_modules << desired.module_id + Log.error(exception: error) { "failed to reconcile module #{desired.module_id}" } + end + end + + # Clean up unused drivers + (binary_manager.compiled_drivers - desired_driver_keys).each do |driver_key| + next if runtime_manager.driver_loaded?(driver_key) + begin + binary_manager.delete_binary(driver_key) + emit(State::RuntimeEvent.new(:driver_removed, driver_key: driver_key)) + rescue error + Log.error(exception: error) { "failed to delete driver binary #{driver_key}" } + end + end + + # Always save snapshot even with partial failures + runtime_store.save_snapshot(snapshot) + + # Report overall reconciliation status + if failed_drivers.any? || failed_modules.any? + error_msg = "Partial reconciliation: #{failed_drivers.size} driver(s) failed, #{failed_modules.size} module(s) failed" + runtime_store.set_last_error(error_msg) + emit(State::RuntimeEvent.new(:snapshot_applied, snapshot_version: snapshot.version, backlog_depth: runtime_store.pending_update_count, message: error_msg)) + else + runtime_store.set_last_error(nil) + emit(State::RuntimeEvent.new(:snapshot_applied, snapshot_version: snapshot.version, backlog_depth: runtime_store.pending_update_count)) + end + end + + private def reconcile_module(desired : State::DesiredModule, current : State::RuntimeModule?) + runtime_manager.load(desired.module_id, desired.driver_key) + + runtime = current || State::RuntimeModule.new(desired.driver_key) + runtime.loaded = true + emit(State::RuntimeEvent.new(:module_loaded, module_id: desired.module_id, driver_key: desired.driver_key)) + + if desired.running + payload_changed = runtime.payload != desired.payload + if runtime.running && payload_changed + runtime_manager.stop(desired.module_id) + emit(State::RuntimeEvent.new(:module_stopped, module_id: desired.module_id, driver_key: desired.driver_key, message: "payload changed")) + runtime.running = false + end + + unless runtime.running + runtime_manager.start(desired.module_id, desired.payload) + runtime.running = true + emit(State::RuntimeEvent.new(:module_started, module_id: desired.module_id, driver_key: desired.driver_key)) + end + elsif runtime.running + runtime_manager.stop(desired.module_id) + runtime.running = false + emit(State::RuntimeEvent.new(:module_stopped, module_id: desired.module_id, driver_key: desired.driver_key)) + end + + runtime.payload = desired.payload + runtime_store.save_runtime_module(desired.module_id, runtime) + rescue error + emit(State::RuntimeEvent.new(:module_failed, module_id: desired.module_id, driver_key: desired.driver_key, message: error.message)) + raise error + end + + private def unload_module(module_id : String, runtime : State::RuntimeModule) + runtime_manager.stop(module_id) if runtime.running + runtime_manager.unload(module_id) if runtime.loaded + runtime_store.delete_runtime_module(module_id) + emit(State::RuntimeEvent.new(:module_unloaded, module_id: module_id, driver_key: runtime.driver_key)) + rescue error + emit(State::RuntimeEvent.new(:module_failed, module_id: module_id, driver_key: runtime.driver_key, message: error.message)) + raise error + end + + private def emit(event : State::RuntimeEvent) + on_event.try &.call(event) + end + end +end diff --git a/src/placeos-edge/runtime_manager.cr b/src/placeos-edge/runtime_manager.cr new file mode 100644 index 00000000..649d0104 --- /dev/null +++ b/src/placeos-edge/runtime_manager.cr @@ -0,0 +1,79 @@ +require "placeos-driver/protocol/management" + +require "../placeos-core/process_manager/common" + +module PlaceOS::Edge + class RuntimeManager + include PlaceOS::Core::ProcessManager::Common + + alias ModuleError = ::PlaceOS::Core::ModuleError + + protected getter store : PlaceOS::Core::DriverStore + private getter on_setting_callback : (String, String, String ->)? + private getter on_redis_callback : (Protocol::RedisAction, String, String, String? ->)? + + def initialize( + @store : PlaceOS::Core::DriverStore = PlaceOS::Core::DriverStore.new, + @on_setting_callback : (String, String, String ->)? = nil, + @on_redis_callback : (Protocol::RedisAction, String, String, String? ->)? = nil, + ) + end + + def execute(module_id : String, payload : String | IO, user_id : String?, mod : Model::Module? = nil) + manager = protocol_manager_by_module?(module_id) + raise ModuleError.new("No protocol manager for #{module_id}") if manager.nil? + + request_body = payload.is_a?(IO) ? payload.gets_to_end : payload + manager.execute( + module_id, + request_body, + user_id: user_id, + ) + rescue error : PlaceOS::Driver::RemoteException + raise error + rescue exception + raise module_error(module_id, exception) + end + + def load(module_id : String, driver_key : String) + driver_key = PlaceOS::Core::ProcessManager.path_to_key(driver_key) + + return true if protocol_manager_by_module?(module_id) + + if existing_driver_manager = protocol_manager_by_driver?(driver_key) + set_module_protocol_manager(module_id, existing_driver_manager) + return true + end + + manager = Driver::Protocol::Management.new(store.path(driver_key).to_s, on_edge: true) + + manager.on_setting = ->(id : String, setting_name : String, setting_value : YAML::Any) { + on_setting_callback.try &.call(id, setting_name, setting_value.to_yaml) + } + + manager.on_redis = ->(action : Protocol::RedisAction, hash_id : String, key_name : String, status_value : String?) { + on_redis_callback.try &.call(action, hash_id, key_name, status_value) + } + + set_module_protocol_manager(module_id, manager) + set_driver_protocol_manager(driver_key, manager) + true + rescue exception + raise module_error(module_id, exception) + end + + def modules + protocol_manager_lock.synchronize do + @module_protocol_managers.keys.to_set + end + end + + def protocol_manager_by_driver?(driver_key : String) + super(driver_key) + end + + def protocol_manager_by_module?(module_id : String) + super(module_id) + end + end +end diff --git a/src/placeos-edge/runtime_store.cr b/src/placeos-edge/runtime_store.cr new file mode 100644 index 00000000..16d14bfd --- /dev/null +++ b/src/placeos-edge/runtime_store.cr @@ -0,0 +1,446 @@ +require "file_utils" +require "uuid" + +require "./state" +require "./constants" +require "../placeos-core/driver_manager/driver_store" + +module PlaceOS::Edge + class RuntimeStore + Log = ::Log.for(self) + + # Use the same path as driver binaries to ensure writability + # If drivers can be stored here, state can too + DEFAULT_PATH = Core::DriverStore::BINARY_PATH + + # Write debouncing: batch writes to reduce I/O + WRITE_DEBOUNCE_INTERVAL = 1.second + + getter path : String + + private getter lock = Mutex.new + private getter write_pending = Atomic(Bool).new(false) + private getter last_core_write = Atomic(Int64).new(0_i64) + + @state : State::PersistedState + + def initialize(@path : String = DEFAULT_PATH) + Dir.mkdir_p(File.join(path, "edge-state")) + @state = load_state + end + + def snapshot : State::Snapshot? + lock.synchronize { @state.snapshot } + end + + def last_snapshot_version : String? + lock.synchronize { @state.last_snapshot_version } + end + + def runtime_modules + lock.synchronize { @state.runtime_modules.dup } + end + + def pending_updates + lock.synchronize { @state.pending_updates.dup } + end + + def pending_events + lock.synchronize { @state.pending_events.dup } + end + + def pending_update_count : Int32 + lock.synchronize { @state.pending_updates.size.to_i32 } + end + + def pending_event_count : Int32 + lock.synchronize { @state.pending_events.size.to_i32 } + end + + def last_error : String? + lock.synchronize { @state.last_error } + end + + def save_snapshot(snapshot : State::Snapshot) + lock.synchronize do + @state = State::PersistedState.new( + snapshot: snapshot, + runtime_modules: @state.runtime_modules, + pending_updates: @state.pending_updates, + pending_events: @state.pending_events, + last_error: @state.last_error, + last_snapshot_version: snapshot.version + ) + + # Debounced write for core state + schedule_core_write + end + end + + def save_runtime_module(module_id : String, runtime : State::RuntimeModule) + lock.synchronize do + runtime_modules = @state.runtime_modules.dup + runtime_modules[module_id] = runtime + + @state = State::PersistedState.new( + snapshot: @state.snapshot, + runtime_modules: runtime_modules, + pending_updates: @state.pending_updates, + pending_events: @state.pending_events, + last_error: @state.last_error, + last_snapshot_version: @state.last_snapshot_version + ) + + # Debounced write for core state + schedule_core_write + end + end + + def delete_runtime_module(module_id : String) + lock.synchronize do + runtime_modules = @state.runtime_modules.dup + runtime_modules.delete(module_id) + + @state = State::PersistedState.new( + snapshot: @state.snapshot, + runtime_modules: runtime_modules, + pending_updates: @state.pending_updates, + pending_events: @state.pending_events, + last_error: @state.last_error, + last_snapshot_version: @state.last_snapshot_version + ) + + # Debounced write for core state + schedule_core_write + end + end + + # Force immediate write of all state (for testing/shutdown) + def flush + lock.synchronize do + persist_core_state + persist_pending_updates + persist_pending_events + end + end + + def queue_update(action : Protocol::RedisAction, hash_id : String, key_name : String, status_value : String?) : State::PendingRedisUpdate + update = State::PendingRedisUpdate.new(UUID.random.to_s, action, hash_id, key_name, status_value) + + lock.synchronize do + pending_updates = collapse_updates(@state.pending_updates.dup, update) + + # Apply backpressure: drop oldest updates if exceeding limit + if pending_updates.size > Edge::MAX_PENDING_UPDATES + Log.warn { "pending updates exceeded #{Edge::MAX_PENDING_UPDATES}, dropping oldest entries" } + pending_updates = pending_updates.last(Edge::MAX_PENDING_UPDATES) + end + + @state = State::PersistedState.new( + snapshot: @state.snapshot, + runtime_modules: @state.runtime_modules, + pending_updates: pending_updates, + pending_events: @state.pending_events, + last_error: @state.last_error, + last_snapshot_version: @state.last_snapshot_version + ) + + # Append to pending updates log (fast, no rewrite) + append_pending_update(update) + end + + update + end + + def queue_event(event : State::RuntimeEvent) : State::PendingRuntimeEvent + pending = State::PendingRuntimeEvent.new(UUID.random.to_s, event) + + lock.synchronize do + pending_events = collapse_events(@state.pending_events.dup, pending) + + # Apply backpressure: drop oldest events if exceeding limit + if pending_events.size > Edge::MAX_PENDING_EVENTS + Log.warn { "pending events exceeded #{Edge::MAX_PENDING_EVENTS}, dropping oldest entries" } + pending_events = pending_events.last(Edge::MAX_PENDING_EVENTS) + end + + @state = State::PersistedState.new( + snapshot: @state.snapshot, + runtime_modules: @state.runtime_modules, + pending_updates: @state.pending_updates, + pending_events: pending_events, + last_error: @state.last_error, + last_snapshot_version: @state.last_snapshot_version + ) + + # Append to pending events log (fast, no rewrite) + append_pending_event(pending) + end + + pending + end + + def acknowledge_update(update_id : String) + lock.synchronize do + @state = State::PersistedState.new( + snapshot: @state.snapshot, + runtime_modules: @state.runtime_modules, + pending_updates: @state.pending_updates.reject { |update| update.id == update_id }, + pending_events: @state.pending_events, + last_error: @state.last_error, + last_snapshot_version: @state.last_snapshot_version + ) + + # Rewrite pending updates file (compaction) + schedule_pending_compaction + end + end + + def acknowledge_event(event_id : String) + lock.synchronize do + @state = State::PersistedState.new( + snapshot: @state.snapshot, + runtime_modules: @state.runtime_modules, + pending_updates: @state.pending_updates, + pending_events: @state.pending_events.reject { |event| event.id == event_id }, + last_error: @state.last_error, + last_snapshot_version: @state.last_snapshot_version + ) + + # Rewrite pending events file (compaction) + schedule_pending_compaction + end + end + + def set_last_error(error : String?) + lock.synchronize do + @state = State::PersistedState.new( + snapshot: @state.snapshot, + runtime_modules: @state.runtime_modules, + pending_updates: @state.pending_updates, + pending_events: @state.pending_events, + last_error: error, + last_snapshot_version: @state.last_snapshot_version + ) + + # Debounced write for core state + schedule_core_write + end + end + + # Debounced write for core state changes (snapshot, modules, error) + private def schedule_core_write + return if write_pending.get + + now = Time.utc.to_unix_ms + last = last_core_write.get + + if now - last > WRITE_DEBOUNCE_INTERVAL.total_milliseconds + # Write immediately if enough time has passed + persist_core_state + last_core_write.set(now) + else + # Schedule delayed write + write_pending.set(true) + spawn do + sleep WRITE_DEBOUNCE_INTERVAL + lock.synchronize do + persist_core_state + last_core_write.set(Time.utc.to_unix_ms) + write_pending.set(false) + end + end + end + end + + # Schedule compaction of pending files (debounced) + private def schedule_pending_compaction + spawn do + sleep 5.seconds # Batch acknowledgments + lock.synchronize do + persist_pending_updates + persist_pending_events + end + end + end + + private def collapse_updates(updates : Array(State::PendingRedisUpdate), new_update : State::PendingRedisUpdate) + if new_update.action.hset? || new_update.action.set? + updates.reject! do |existing| + existing.action == new_update.action && + existing.hash_id == new_update.hash_id && + existing.key_name == new_update.key_name + end + end + + updates << new_update + updates + end + + private def collapse_events(events : Array(State::PendingRuntimeEvent), pending : State::PendingRuntimeEvent) + event = pending.event + + if event.kind.sync_status? || event.kind.snapshot_applied? + events.reject! do |existing| + existing.event.kind == event.kind + end + end + + events << pending + events + end + + private def load_state : State::PersistedState + # Load core state + core_state = if File.exists?(core_state_file) + begin + State::PersistedState.from_json(File.read(core_state_file)) + rescue error + Log.warn(exception: error) { "failed to load core state" } + State::PersistedState.new + end + else + State::PersistedState.new + end + + # Load pending updates from append-only log + pending_updates = load_pending_updates + + # Load pending events from append-only log + pending_events = load_pending_events + + # Merge into single state + State::PersistedState.new( + snapshot: core_state.snapshot, + runtime_modules: core_state.runtime_modules, + pending_updates: pending_updates, + pending_events: pending_events, + last_error: core_state.last_error, + last_snapshot_version: core_state.last_snapshot_version + ) + end + + private def load_pending_updates : Array(State::PendingRedisUpdate) + return [] of State::PendingRedisUpdate unless File.exists?(pending_updates_file) + + updates = [] of State::PendingRedisUpdate + File.each_line(pending_updates_file) do |line| + next if line.strip.empty? + updates << State::PendingRedisUpdate.from_json(line) + rescue error + Log.warn(exception: error) { "failed to parse pending update line" } + end + updates + rescue error + Log.warn(exception: error) { "failed to load pending updates" } + [] of State::PendingRedisUpdate + end + + private def load_pending_events : Array(State::PendingRuntimeEvent) + return [] of State::PendingRuntimeEvent unless File.exists?(pending_events_file) + + events = [] of State::PendingRuntimeEvent + File.each_line(pending_events_file) do |line| + next if line.strip.empty? + events << State::PendingRuntimeEvent.from_json(line) + rescue error + Log.warn(exception: error) { "failed to parse pending event line" } + end + events + rescue error + Log.warn(exception: error) { "failed to load pending events" } + [] of State::PendingRuntimeEvent + end + + # Persist core state (snapshot, modules, error) - debounced + private def persist_core_state + temp = "#{core_state_file}.tmp" + + begin + # Only persist core state, not pending items + core_only = State::PersistedState.new( + snapshot: @state.snapshot, + runtime_modules: @state.runtime_modules, + pending_updates: [] of State::PendingRedisUpdate, + pending_events: [] of State::PendingRuntimeEvent, + last_error: @state.last_error, + last_snapshot_version: @state.last_snapshot_version + ) + + File.write(temp, core_only.to_json) + File.rename(temp, core_state_file) + rescue ex : File::Error + Log.warn(exception: ex) { "failed to persist core state, continuing in-memory only" } + File.delete(temp) rescue nil + end + end + + # Append single update to log (fast, no rewrite) + private def append_pending_update(update : State::PendingRedisUpdate) + begin + File.open(pending_updates_file, "a") do |file| + file.puts(update.to_json) + end + rescue ex : File::Error + Log.warn(exception: ex) { "failed to append pending update" } + end + end + + # Append single event to log (fast, no rewrite) + private def append_pending_event(event : State::PendingRuntimeEvent) + begin + File.open(pending_events_file, "a") do |file| + file.puts(event.to_json) + end + rescue ex : File::Error + Log.warn(exception: ex) { "failed to append pending event" } + end + end + + # Rewrite pending updates file (compaction after acknowledgments) + private def persist_pending_updates + temp = "#{pending_updates_file}.tmp" + + begin + File.open(temp, "w") do |file| + @state.pending_updates.each do |update| + file.puts(update.to_json) + end + end + File.rename(temp, pending_updates_file) + rescue ex : File::Error + Log.warn(exception: ex) { "failed to compact pending updates" } + File.delete(temp) rescue nil + end + end + + # Rewrite pending events file (compaction after acknowledgments) + private def persist_pending_events + temp = "#{pending_events_file}.tmp" + + begin + File.open(temp, "w") do |file| + @state.pending_events.each do |event| + file.puts(event.to_json) + end + end + File.rename(temp, pending_events_file) + rescue ex : File::Error + Log.warn(exception: ex) { "failed to compact pending events" } + File.delete(temp) rescue nil + end + end + + private def core_state_file + File.join(path, "edge-state", "core.json") + end + + private def pending_updates_file + File.join(path, "edge-state", "pending-updates.jsonl") + end + + private def pending_events_file + File.join(path, "edge-state", "pending-events.jsonl") + end + end +end diff --git a/src/placeos-edge/server.cr b/src/placeos-edge/server.cr index d7c519ca..29737c3c 100644 --- a/src/placeos-edge/server.cr +++ b/src/placeos-edge/server.cr @@ -25,16 +25,34 @@ module PlaceOS::Edge # def manage_edge(edge_id : String, socket : HTTP::WebSocket) Log.info { {edge_id: edge_id, message: "managing edge"} } - socket.on_close do - edges_lock.write do - edges.delete(edge_id) if edges[edge_id]? == socket - end + + if edge = PlaceOS::Model::Edge.find?(edge_id) + edge.update_fields(online: true, last_seen: Time.utc) end - manager = PlaceOS::Core::ProcessManager::Edge.new(edge_id, socket) + manager = nil.as(PlaceOS::Core::ProcessManager::Edge?) + manager = PlaceOS::Core::ProcessManager::Edge.new(edge_id, socket, -> { + active = false + edges_lock.write do + current = edges[edge_id]? + active = current.same?(manager) + edges.delete(edge_id) if active + end + manager.not_nil!.disconnected! if active + }) + replaced = nil.as(PlaceOS::Core::ProcessManager::Edge?) edges_lock.write do - edges[edge_id] = manager + replaced = edges[edge_id]? + edges[edge_id] = manager.not_nil! + end + + if stale = replaced + begin + stale.transport.disconnect + rescue + nil + end end end @@ -44,15 +62,39 @@ module PlaceOS::Edge if edge = edges_lock.read { edges[edge_id]? } edge else - Log.error { "no manager found for edge #{edge_id}" } + Log.debug { "no manager found for edge #{edge_id}" } nil end end # :ditto: - def for?(edge_id : String, & : ProcessManager::Edge) + def for?(edge_id : String, & : Core::ProcessManager::Edge ->) manager = for?(edge_id) yield manager unless manager.nil? end + + def runtime_status + edges_lock.read do + edges.transform_values(&.runtime_status) + end + end + + def stop : Nil + managers = edges_lock.write do + current = edges.values.dup + edges.clear + current + end + + managers.each do |manager| + begin + manager.transport.disconnect + rescue + nil + ensure + manager.disconnected! + end + end + end end end diff --git a/src/placeos-edge/state.cr b/src/placeos-edge/state.cr new file mode 100644 index 00000000..34045b75 --- /dev/null +++ b/src/placeos-edge/state.cr @@ -0,0 +1,132 @@ +require "json" + +module PlaceOS::Edge + module State + enum RuntimeEventKind + ModuleLoaded + ModuleStarted + ModuleStopped + ModuleUnloaded + ModuleFailed + DriverReady + DriverRemoved + SnapshotApplied + SyncStatus + end + + struct DesiredDriver + include JSON::Serializable + + getter key : String + + def initialize(@key : String) + end + end + + struct DesiredModule + include JSON::Serializable + + getter module_id : String + getter driver_key : String + getter running : Bool + getter payload : String + + def initialize(@module_id : String, @driver_key : String, @running : Bool, @payload : String) + end + end + + struct Snapshot + include JSON::Serializable + + @[JSON::Field(converter: Time::EpochConverter)] + getter last_modified : Time + getter edge_id : String + getter version : String + getter drivers : Array(DesiredDriver) + getter modules : Array(DesiredModule) + + def initialize(@edge_id : String, @version : String, @last_modified : Time, @drivers : Array(DesiredDriver), @modules : Array(DesiredModule)) + end + end + + struct RuntimeModule + include JSON::Serializable + + getter driver_key : String + property loaded : Bool + property running : Bool + property payload : String? + + def initialize(@driver_key : String, @loaded : Bool = false, @running : Bool = false, @payload : String? = nil) + end + end + + struct PendingRedisUpdate + include JSON::Serializable + + getter id : String + getter action : Protocol::RedisAction + getter hash_id : String + getter key_name : String + getter status_value : String? + + def initialize(@id : String, @action : Protocol::RedisAction, @hash_id : String, @key_name : String, @status_value : String?) + end + end + + struct PendingRuntimeEvent + include JSON::Serializable + + getter id : String + getter event : RuntimeEvent + + def initialize(@id : String, @event : RuntimeEvent) + end + end + + struct RuntimeEvent + include JSON::Serializable + + @[JSON::Field(converter: Time::EpochConverter)] + getter timestamp : Time + getter kind : RuntimeEventKind + getter module_id : String? + getter driver_key : String? + getter message : String? + getter snapshot_version : String? + getter backlog_depth : Int32? + + def initialize( + @kind : RuntimeEventKind, + @timestamp : Time = Time.utc, + @module_id : String? = nil, + @driver_key : String? = nil, + @message : String? = nil, + @snapshot_version : String? = nil, + @backlog_depth : Int32? = nil, + ) + end + end + + struct PersistedState + include JSON::Serializable + + getter snapshot : Snapshot? + getter runtime_modules : Hash(String, RuntimeModule) + getter pending_updates : Array(PendingRedisUpdate) + getter pending_events : Array(PendingRuntimeEvent) + getter last_error : String? + getter last_snapshot_version : String? + + def initialize( + @snapshot : Snapshot? = nil, + @runtime_modules : Hash(String, RuntimeModule) = {} of String => RuntimeModule, + @pending_updates : Array(PendingRedisUpdate) = [] of PendingRedisUpdate, + @pending_events : Array(PendingRuntimeEvent) = [] of PendingRuntimeEvent, + @last_error : String? = nil, + @last_snapshot_version : String? = nil, + ) + end + end + end +end diff --git a/src/placeos-edge/transport.cr b/src/placeos-edge/transport.cr index 53fc8264..12929520 100644 --- a/src/placeos-edge/transport.cr +++ b/src/placeos-edge/transport.cr @@ -53,11 +53,20 @@ module PlaceOS::Edge end def connect(uri : URI, socket : HTTP::WebSocket?) + if socket + Log.debug { "core connection established" } + spawn { on_connect.try &.call } if on_connect + run_socket(socket).run + disconnect unless close_channel.closed? + return + end + SimpleRetry.try_to( base_interval: 500.milliseconds, max_interval: 5.seconds ) do |_run_count, error| if error + break if close_channel.closed? Log.warn { {error: error.to_s, message: "reconnecting"} } on_disconnect.try(&.call(error)) if error.is_a? IO::Error socket = nil @@ -67,7 +76,8 @@ module PlaceOS::Edge Log.debug { "core connection established" } spawn { on_connect.try &.call } if on_connect run_socket(socket.as(HTTP::WebSocket)).run - raise "rest api disconnected" unless close_channel.closed? + break if close_channel.closed? + raise "rest api disconnected" end rescue error disconnect @@ -86,9 +96,15 @@ module PlaceOS::Edge @ping_failures += 1 Log.debug { "keepalive ping failed #{@ping_failures} times" } - # if we've been disconnect for ~5min then we restart the service - if @ping_failures > 30 - Log.fatal { "connection failure, restarting..." } + # Log warning at 1 minute of failures + if @ping_failures == 6 + Log.warn { "websocket connection appears to be down, reconnection in progress" } + end + + # Only exit as last resort after ~10 minutes of continuous failures + # This gives reconnection logic time to work + if @ping_failures > 60 + Log.fatal { "websocket connection failed for 10+ minutes, restarting process..." } sleep(interval) exit(2) end @@ -99,11 +115,17 @@ module PlaceOS::Edge end def disconnect - close_channel.close - socket_channel.close + socket_lock.synchronize do + @socket.try(&.close) rescue nil + end + close_channel.close rescue nil + socket_channel.close rescue nil response_lock.synchronize do responses.each_value(&.close) rescue nil + responses.clear end + rescue Channel::ClosedError + nil end protected def run_socket(socket : HTTP::WebSocket) @@ -148,7 +170,7 @@ module PlaceOS::Edge end end - protected def send_response(id : UInt64, response : Protocol::Response) + protected def send_response(id : UInt64, response : Protocol::Client::Response | Protocol::Message::BinaryBody | Protocol::Message::Success) message = case response in Protocol::Message::Body Protocol::Text.new(sequence_id: id, body: response) @@ -162,6 +184,8 @@ module PlaceOS::Edge end socket_channel.send(message) + rescue Channel::ClosedError + nil end protected def send_request(request : Protocol::Request) : Protocol::Response? @@ -172,10 +196,18 @@ module PlaceOS::Edge responses[id] = response_channel end - socket_channel.send(Protocol::Text.new(sequence_id: id, body: request)) + begin + socket_channel.send(Protocol::Text.new(sequence_id: id, body: request)) + rescue Channel::ClosedError + response_lock.write do + responses.delete(id) + end + return nil + end - select - when response = response_channel.receive? + response = select + when received = response_channel.receive? + received when timeout 30.seconds raise Error::TransportTimeout.new(request) end @@ -187,6 +219,14 @@ module PlaceOS::Edge response end + protected def send_event(request : Protocol::Request) + socket_channel.send(Protocol::Text.new(sequence_id: sequence_id, body: request)) + rescue Channel::ClosedError + nil + ensure + nil + end + private def on_message(message) handle_message(Protocol::Text.from_json(message)) rescue e : JSON::ParseException @@ -210,7 +250,7 @@ module PlaceOS::Edge in Protocol::Response response_lock.read do if channel = responses[message.sequence_id]? - channel.send(body) + channel.send(body) rescue nil else Log.error { { sequence_id: message.sequence_id.to_s,