Compare commits

...

9 Commits

Author SHA1 Message Date
Marius Pana 070f6de0af new incident 2023-10-16 13:29:47 +03:00
Marius Pana 167c86618c update latest incident 2022-09-23 18:04:47 +03:00
Marius Pana 5c51b3ae40 added new incident 2022-09-19 13:05:44 +03:00
Marius Pana 6db0a4f907 add new incident 09/13/2022 2022-09-13 12:20:23 +03:00
Marius Pana be72f9b98b update latest incident 2022-09-03 12:51:13 +03:00
Marius Pana 887bb6f956 update last incident 2022-09-03 12:46:04 +03:00
Marius Pana 79afc50484 update statuskit 2022-09-03 11:32:55 +03:00
Marius Pana 9fdc8c5f35 update incident 2022-09-03 11:32:42 +03:00
Marius Pana fe03670671 add new service 2022-09-03 11:32:31 +03:00
12 changed files with 17261 additions and 11330 deletions

View File

@ -48,16 +48,9 @@ Incidents are plain markdown files inside the `site/content/incidents` directory
### Creating new incidents
Adding incidents to your status page is as simple as adding a new document to the incidents collection.
Create a new incident using npm:
Duplicate an existing incident from site/content/incidents. You can use one of the following severities:
```
npm run new-incident
```
You'll be asked a series of questions about the incident, then Hugo will generate a new file pre-filled with your responses.
After explaining the current situation in the incident, you can just push the file to GitHub. Netlify will deploy the incident announcement for you in a matter of seconds.
["under-maintenance", "degraded-performance", "partial-outage", "major-outage"]
### Resolving incidents

17796
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@ -24,15 +24,15 @@
"eslint": "^5.13.0",
"eslint-plugin-import": "^2.16.0",
"exports-loader": "^0.7.0",
"gulp": "^4.0.0",
"gulp": "^4.0.2",
"gulp-babel": "^8.0.0-beta.2",
"gulp-postcss": "^8.0.0",
"gulp-postcss": "^9.0.1",
"imports-loader": "^0.8.0",
"inquirer": "^6.2.2",
"lodash.kebabcase": "^4.1.1",
"postcss-import": "^12.0.1",
"postcss-loader": "^3.0.0",
"postcss-preset-env": "^6.5.0",
"postcss-import": "^15.1.0",
"postcss-loader": "^7.3.3",
"postcss-preset-env": "^9.2.0",
"tomlify-j0.4": "^3.0.0",
"tomljs": "^0.1.3"
},
@ -41,8 +41,8 @@
},
"devDependencies": {
"babel-core": "^6.26.3",
"babel-loader": "^6.4.1",
"babel-loader": "^9.1.3",
"babel-preset-env": "^1.7.0",
"webpack": "^1.15.0"
"webpack": "^5.89.0"
}
}

View File

@ -13,4 +13,4 @@ title = "Spearhead Cloud Status"
# You'll be able to change their status every time
# you open or update an incident.
# Replace these examples with your own system names.
systems = ["EU-RO-1 DC", "CloudAPI", "DNS", "Customer Portal", "docs.spearhead.cloud"]
systems = ["EU-RO-1 DC", "CloudAPI", "CNS", "Docker API", "Customer Portal", "docs.spearhead.cloud", "Tenant Machines"]

View File

@ -0,0 +1,13 @@
+++
title = "Compute Node Network Connectivity"
date = 2022-09-13T07:45:00.000Z
severity = "partial-outage"
affectedsystems = [
"Tenant Machines"
]
resolved = true
+++
One of our compute nodes entered into a faulty state. We have gathered logs and debug information to continue our analysis.
**Update**: Server reboot has completed successfully and all systems are operational. {{< track "2022-09-13T08:00:00.000Z" >}}

View File

@ -0,0 +1,18 @@
+++
title = "Intermittent network connectivity"
date = 2023-10-16T09:32:00.000Z
severity = "partial-outage"
affectedsystems = [
"EU-RO-1 DC",
"Tenant Machines"
]
resolved = true
+++
Our upstream datacenter provider is experiencing a DDoS attack which has affected some of our prefixes. Both parties (Spearhead and our Datacenter operator) are working to mitigate the effects of this attack. We can expect intermittent access to the spearhead.cloud resources while we work with our providers to stop the attack or find other measures to bring stability back to the platform.
**Update**: Network connectivity has stabilised and we are continuing to monitor the situation. {{< track "2023-10-16T09:48:00.000Z" >}}
**Update**: Our upstream network provider has confirmed that the large-scale (90Gbps) DDoS attack has been mitigated. All network operations have returned to normal and we do not expect any further issues. We apologise for the inconvenience. {{< track "2023-10-16T12:31:00.000Z" >}}

View File

@ -0,0 +1,13 @@
+++
title = "Compute Node Network Connectivity"
date = 2022-09-19T07:37:00.000Z
severity = "partial-outage"
affectedsystems = [
"Tenant Machines"
]
resolved = true
+++
One of our compute nodes entered into a faulty state. We have gathered logs and debug information to continue our analysis. This is a recurring issue that is under continuous investigation.
**Update**: Server reboot has completed successfully and all systems are operational. {{< track "2022-09-10T04:00:00.000Z" >}}

View File

@ -0,0 +1,15 @@
+++
title = "Compute Node Network Connectivity"
date = 2022-09-23T14:35:00.000Z
severity = "partial-outage"
affectedsystems = [
"Tenant Machines"
]
resolved = true
+++
One of our compute nodes has entered into a faulty state. We are gathering logs and debug information to continue our analysis. This is a recurring issue that is under continuous investigation.
**Update**: We are generating a crash dump for this incident and will schedule an immediate reboot of the server. {{< track "2022-09-23T14:38:00.000Z" >}}
**Update**: We have obtained a crash dump and all systems are operating normally again. {{< track "2022-09-23T15:00:00.000Z" >}}

View File

@ -0,0 +1,16 @@
+++
title = "compute node network connectivity"
date = 2022-08-29T10:35:00.000Z
severity = "partial-outage"
affectedsystems = [
"EU-RO-1 DC",
"Tenant Machines",
]
resolved = true
+++
Our monitoring has identified intermittent (flapping) network access to customer machines on one of our compute nodes. This is limited to a single compute node and the symptoms are intermittent network connectivity (resulting in dropped sessions).
**Update**: A reboot of the compute node has restored all affected customer machines. {{< track "2022-08-18T14:34:00.000Z" >}}
**Update**: As of our current understanding, we cannot replicate this issue, which makes it difficult to troubleshoot. A reboot has been applied and the systems are behaving normally. We are continuing our efforts to reproduce this issue and thereby find a permanent solution. {{< track "2022-08-18T20:34:00.000Z" >}}

View File

@ -0,0 +1,14 @@
+++
title = "intermittent network connectivity"
date = 2020-11-18T18:15:00.000Z
severity = "under-maintenance"
affectedsystems = [
"EU-RO-1 DC",
"CloudAPI"
]
resolved = true
+++
One of our management servers requires a reboot in order to perform an OS update. The update is expected to last several minutes, during which time our APIs will be unavailable.
**Update**: Server reboot has completed successfully and all systems are operational. {{< track "2020-11-18T18:34:00.000Z" >}}

10679
yarn.lock

File diff suppressed because it is too large Load Diff