diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..01b019f
Binary files /dev/null and b/.DS_Store differ
diff --git a/404.html b/404.html
index 192f043..be6f116 100644
--- a/404.html
+++ b/404.html
@@ -214,7 +214,7 @@
           
             
   <li>
-    <a class="" title="Home" href="">
+    <a class="" title="Home" href="/">
       Home
     </a>
     
@@ -228,7 +228,7 @@
       
         
   <li>
-    <a class="" title="Being On-Call" href="/oncall/being_oncall">
+    <a class="" title="Being On-Call" href="/oncall/being_oncall/">
       Being On-Call
     </a>
     
@@ -237,7 +237,7 @@
       
         
   <li>
-    <a class="" title="Alerting Principles" href="/oncall/alerting_principles">
+    <a class="" title="Alerting Principles" href="/oncall/alerting_principles/">
       Alerting Principles
     </a>
     
@@ -255,7 +255,7 @@
       
         
   <li>
-    <a class="" title="Severity Levels" href="/before/severity_levels">
+    <a class="" title="Severity Levels" href="/before/severity_levels/">
       Severity Levels
     </a>
     
@@ -264,7 +264,7 @@
       
         
   <li>
-    <a class="" title="Different Roles" href="/before/different_roles">
+    <a class="" title="Different Roles" href="/before/different_roles/">
       Different Roles
     </a>
     
@@ -273,7 +273,7 @@
       
         
   <li>
-    <a class="" title="Call Etiquette" href="/before/call_etiquette">
+    <a class="" title="Call Etiquette" href="/before/call_etiquette/">
       Call Etiquette
     </a>
     
@@ -291,7 +291,7 @@
       
         
   <li>
-    <a class="" title="During An Incident" href="/during/during_an_incident">
+    <a class="" title="During An Incident" href="/during/during_an_incident/">
       During An Incident
     </a>
     
@@ -300,7 +300,7 @@
       
         
   <li>
-    <a class="" title="Security Incident" href="/during/security_incident_response">
+    <a class="" title="Security Incident" href="/during/security_incident_response/">
       Security Incident
     </a>
     
@@ -318,7 +318,7 @@
       
         
   <li>
-    <a class="" title="Post-Mortem Process" href="/after/post_mortem_process">
+    <a class="" title="Post-Mortem Process" href="/after/post_mortem_process/">
       Post-Mortem Process
     </a>
     
@@ -327,7 +327,7 @@
       
         
   <li>
-    <a class="" title="Post-Mortem Template" href="/after/post_mortem_template">
+    <a class="" title="Post-Mortem Template" href="/after/post_mortem_template/">
       Post-Mortem Template
     </a>
     
@@ -345,7 +345,7 @@
       
         
   <li>
-    <a class="" title="Overview" href="/training/overview">
+    <a class="" title="Overview" href="/training/overview/">
       Overview
     </a>
     
@@ -354,7 +354,7 @@
       
         
   <li>
-    <a class="" title="Team Leader" href="/training/team_leader">
+    <a class="" title="Team Leader" href="/training/team_leader/">
       Team Leader
     </a>
     
@@ -363,7 +363,7 @@
       
         
   <li>
-    <a class="" title="Sysadmin" href="/training/sysadmin">
+    <a class="" title="Sysadmin" href="/training/sysadmin/">
       Sysadmin
     </a>
     
@@ -372,7 +372,7 @@
       
         
   <li>
-    <a class="" title="Scribe" href="/training/scribe">
+    <a class="" title="Scribe" href="/training/scribe/">
       Scribe
     </a>
     
@@ -381,7 +381,7 @@
       
         
   <li>
-    <a class="" title="Subject Matter Expert" href="/training/subject_matter_expert">
+    <a class="" title="Subject Matter Expert" href="/training/subject_matter_expert/">
       Subject Matter Expert
     </a>
     
@@ -390,7 +390,7 @@
       
         
   <li>
-    <a class="" title="Glossary" href="/training/glossary">
+    <a class="" title="Glossary" href="/training/glossary/">
       Glossary
     </a>
     
@@ -403,7 +403,7 @@
           
             
   <li>
-    <a class="" title="About" href="/about">
+    <a class="" title="About" href="/about/">
       About
     </a>
     
diff --git a/about/index.html b/about/index.html
index e85752e..401681d 100644
--- a/about/index.html
+++ b/about/index.html
@@ -468,21 +468,22 @@
           
             <h1>About</h1>
           
-          <p>This site documents parts of the Spearhead Systems Issue Response process. It is a cut-down version of our internal documentation, used at Spearhead Systems for any incident or service request, and to prepare new employees for on-call responsibilities. It provides information not only on preparation but also what to do during and after.</p>
+          <p>This site documents parts of the Spearhead Systems technical support response process. It is a cut-down version of our internal documentation, used at Spearhead Systems for any incident or service request, and to prepare new employees for on-call responsibilities. It provides information not only on preparation but also what to do during and after.</p>
 <p>This documentation is complementary to what is available in our <a href="https://sphsys.sharepoint.com">existing wiki</a>.</p>
 <h2 id="what-is-this">What is this?<a class="headerlink" href="#what-is-this" title="Permanent link">#</a></h2>
 <p>A collection of pages detailing how to efficiently deal with any incident or service request that might arise, along with information on how to go on-call effectively. It provides lessons learned the hard way, along with training material for getting you up to speed quickly.</p>
 <h2 id="who-is-this-for">Who is this for?<a class="headerlink" href="#who-is-this-for" title="Permanent link">#</a></h2>
-<p>It is intended for on-call practitioners and those involved in an operational incident or service request response process, or those wishing to enact a formal incident response process. Specifically this is for all of our Technical Support staff.</p>
+<p>It is intended for our technical support staff and customers/partners looking for more details regarding our support process. </p>
 <h2 id="why-do-i-need-it">Why do I need it?<a class="headerlink" href="#why-do-i-need-it" title="Permanent link">#</a></h2>
-<p>As a service provider Spearhead Systems deals with service requests on a daily basis. The reason we exist is to deliver a service which in most cases boils down to incidents and service requests. We want to deliver a smooth and seamless experience for resolving our customers issues therefore this documentation is a guideline for how we handle these requests. This documentation will allow you give you a head start on how to deal with issues in a way which leads to the fastest possible recovery time.</p>
+<p>As a service provider, Spearhead Systems deals with technical support requests on a daily basis. The reason we exist is to deliver our technical support services, which boils down to responding to incidents and service requests. We want to deliver a smooth and seamless experience for resolving our customers' issues; therefore, this documentation is a guideline for how we handle these requests. This documentation will give you a head start on how to deal with issues in a way which leads to the fastest possible recovery time.</p>
 <h2 id="what-is-covered">What is covered?<a class="headerlink" href="#what-is-covered" title="Permanent link">#</a></h2>
 <p>Anything from preparing to <a href="../oncall/being_oncall/">go on-call</a>, definitions of <a href="../before/severity_levels/">severities</a>, incident <a href="../before/call_etiquette/">call etiquette</a>, all the way to how to run a <a href="../after/post_mortem_process/">post-mortem</a>, providing a <a href="../after/post_mortem_template/">post-mortem template</a> and even a <a href="../during/security_incident_response/">security incident response process</a>.</p>
 <h2 id="what-is-missing">What is missing?<a class="headerlink" href="#what-is-missing" title="Permanent link">#</a></h2>
-<p>Lots, dig in an help us complete the picture. We can migrate most processes from Sharepoint here.</p>
+<p>Lots, dig in and help us complete the picture. We can migrate most processes from Sharepoint here. We're also looking for experienced operations/support people who are willing to share their experience with us and help us provide a better support service.</p>
 <h2 id="license">License<a class="headerlink" href="#license" title="Permanent link">#</a></h2>
 <p>This documentation is provided under the Apache License 2.0. In plain English that means you can use and modify this documentation and use it both commercially and for private use. However, you must include any original copyright notices, and the original LICENSE file.</p>
 <p>Whether you are a Spearhead Systems customer or not, we want you to have the ability to use this documentation internally at your own company. You can view the source code for all of this documentation on our GitHub account, feel free to fork the repository and use it as a base for your own internal documentation.</p>
+<p>Please also check out <a href="https://github.com/PagerDuty/incident-response-docs/">PagerDuty's response documentation</a>, which has made our own efforts in documenting our process much easier.</p>
           <aside class="copyright" role="note">
             
               Copyright &copy; Spearhead Systems, Inc. &ndash;
diff --git a/after/post_mortem_process/index.html b/after/post_mortem_process/index.html
index 9c95ae6..da28cd3 100644
--- a/after/post_mortem_process/index.html
+++ b/after/post_mortem_process/index.html
@@ -475,7 +475,7 @@
           <p>For every major issue (SR/IN +major), we need to follow up with a post-mortem. A blame-free, detailed description, of exactly what went wrong in order to cause the incident, along with a list of steps to take in order to prevent a similar incident from occurring again in the future. The incident response process itself should also be included.</p>
 <p><img alt="Post-Mortem" src="../../assets/img/headers/pagerduty_post_mortem.jpg" /></p>
 <h2 id="owner-designation">Owner Designation<a class="headerlink" href="#owner-designation" title="Permanent link">#</a></h2>
-<p>The first step is that a post-mortem owner will be designated. This is done by the TL either at the end of a major incident call, or very shortly after. You will be notified directly by the TL if you are the owner for the post-mortem. The owner is responsible for populating the post-mortem page, looking up logs, managing the followup investigation, and keeping all interested parties in the loop. Please use DoIT and Slack for coordinating followup. A detailed list of the steps is available below,</p>
+<p>The first step is that a post-mortem owner will be designated. This is done by the TL either at the end of a major incident call, or very shortly after. You will be notified directly by the TL if you are the owner for the post-mortem. The owner is responsible for populating the post-mortem page, looking up logs, managing the followup investigation, and keeping all interested parties in the loop. Please use DoIT and our internal Chat for coordinating followup. A detailed list of the steps is available below.</p>
 <h2 id="owner-responsibilities">Owner Responsibilities<a class="headerlink" href="#owner-responsibilities" title="Permanent link">#</a></h2>
 <p>As owner of a post-mortem, you are responsible for the following,</p>
 <ul>
diff --git a/after/post_mortem_template/index.html b/after/post_mortem_template/index.html
index 1d7a5af..7516179 100644
--- a/after/post_mortem_template/index.html
+++ b/after/post_mortem_template/index.html
@@ -517,36 +517,13 @@ Don't wait until you've filled in the info to schedule the meeting, however make
 <p><em>Include a description what solved the problem. If there was a temporary fix in place, describe that along with the long-term solution.</em></p>
 <h2 id="impact">Impact<a class="headerlink" href="#impact" title="Permanent link">#</a></h2>
 <p><em>Be very specific here, include exact numbers.</em></p>
-<table>
-<thead>
-<tr>
-<th>Time in SR-3</th>
-<th>?mins</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td>Notifications Delivered out of SLA</td>
-<td>??% (?? of ??)</td>
-</tr>
-<tr>
-<td>Events Dropped / Not Accepted</td>
-<td>??% (?? of ??) <em>Should usually be 0, but always check</em></td>
-</tr>
-<tr>
-<td>Accounts Affected</td>
-<td>??</td>
-</tr>
-<tr>
-<td>Users Affected</td>
-<td>??</td>
-</tr>
-<tr>
-<td>Support Requests Raised</td>
-<td>?? <em>Include any relevant links to tickets</em></td>
-</tr>
-</tbody>
-</table>
+<table><tbody><tr><td>Time in SR-3</td><td>?mins</td></tr>
+<tr><td>Time in IN-3</td><td>?mins</td></tr>
+<tr><td>Notifications Delivered out of SLA</td><td>??% (?? of ??)</td></tr>
+<tr><td>Events Dropped / Not Accepted</td><td>??% (?? of ??) <em>Should usually be 0, but always check</em></td></tr>
+<tr><td>Accounts Affected</td><td>??</td></tr>
+<tr><td>Users Affected</td><td>??</td></tr>
+<tr><td>Support Requests Raised</td><td>?? <em>Include any relevant links to tickets</em></td></tr></tbody></table>
 <h2 id="responders">Responders<a class="headerlink" href="#responders" title="Permanent link">#</a></h2>
 <ul>
 <li><em>Who was the TL?</em></li>
diff --git a/before/call_etiquette/index.html b/before/call_etiquette/index.html
index 86a31fb..911fe33 100644
--- a/before/call_etiquette/index.html
+++ b/before/call_etiquette/index.html
@@ -290,8 +290,8 @@
         <ul>
           
             <li class="anchor">
-              <a title="First Steps" href="#first-steps">
-                First Steps
+              <a title="First Steps regarding Incidents" href="#first-steps-regarding-incidents">
+                First Steps regarding Incidents
               </a>
             </li>
           
@@ -463,21 +463,21 @@
           <p>You've just joined Spearhead Systems support staff and you've never worked in a service delivery function before. You have no idea what an incident or a service request is. You have no idea what's going on, or what you're supposed to be doing. This page will help you through your first time and will provide a reference for future issues you may be a part of.</p>
 <p><img alt="Obama phone" src="../../assets/img/headers/obama_phone.jpg" />
 <em>Credit: <a href="https://commons.wikimedia.org/wiki/File:Barack_Obama_on_phone_with_Benjamin_Netanyahu_2009-06-08.jpg">Official White House Photo</a> by Pete Souza</em></p>
-<h2 id="first-steps">First Steps<a class="headerlink" href="#first-steps" title="Permanent link">#</a></h2>
+<h2 id="first-steps-regarding-incidents">First Steps regarding Incidents<a class="headerlink" href="#first-steps-regarding-incidents" title="Permanent link">#</a></h2>
 <ul>
-<li>If you intend on participating on the incident call you should join both the call, review the associated cards in DoIT, and jump on the corresponding Slack channel.</li>
+<li>If you intend to participate in the incident call you should join the call (if there is one), review the associated cards in DoIT, and join the corresponding internal Chat channel.</li>
 <li>Make sure you are in a quiet environment in order to participate on the call. Background noise should be kept to a minimum.</li>
 <li>Keep your microphone muted until you have something to say.</li>
 <li>Identify yourself when you join the call; State your name and the system you are the expert for.</li>
 <li>Speak up and speak clearly.</li>
 <li>Be direct and factual.</li>
 <li>Keep conversations/discussions short and to the point.</li>
-<li>Bring any concerns to the Team Leader (IC) on the call.</li>
+<li>Bring any concerns to the Team Leader (TL) on the call.</li>
 <li>Respect time constraints given by the Team Leader.</li>
 </ul>
 <div class="admonition warning">
 <p class="admonition-title">Incident Call</p>
-<p>Not all issues start with an incident call. Some issues may be completely automated and available only in DoIT while others may be in the incipient stages and the customer may still be on the phone/Slack detailing their issue.</p>
+<p>Not all issues start with an incident call. Some issues may be completely automated and available only in DoIT while others may be in the incipient stages and the customer may still be on the phone/internal Chat detailing their issue.</p>
 </div>
 <h2 id="lingo">Lingo<a class="headerlink" href="#lingo" title="Permanent link">#</a></h2>
 <p><strong>Use clear terminology, and avoid using acronyms or abbreviations during a call. Clear and accurate communication is more important than quick communication.</strong></p>
@@ -492,6 +492,10 @@
 <p>Do not invent new abbreviations, and always favor being explicit of implicit. It is better to make things clearer than to try and save time by abbreviating, only to have a misunderstanding because others didn't know the abbreviation.</p>
 <h2 id="the-team-leader">The Team Leader<a class="headerlink" href="#the-team-leader" title="Permanent link">#</a></h2>
 <p>The Team Leader (TL) is the leader of the incident response process, and is responsible for bringing the incident to resolution. They will announce themselves at the start of the call, and will generally be doing most of the talking.</p>
+<div class="admonition info">
+<p class="admonition-title">TL is not available</p>
+<p>A TL may not be available in which case the incident call will be guided by the senior Sysadmin or SME available.</p>
+</div>
 <ul>
 <li>Follow all instructions from the team leader, without exception.</li>
 <li>Do not perform any actions unless the team leader has told you to do so.</li>
@@ -508,11 +512,9 @@
 </ul>
 <h2 id="problems">Problems?<a class="headerlink" href="#problems" title="Permanent link">#</a></h2>
 <h4 id="theres-no-team-leader-on-the-call-i-dont-know-what-to-do">There's no team leader on the call! I don't know what to do!<a class="headerlink" href="#theres-no-team-leader-on-the-call-i-dont-know-what-to-do" title="Permanent link">#</a></h4>
-<p>Ask on the call if an TL is present. If you have no response, try asking in Slack. If there is no TL the sysadmin can take over this role temporarily.</p>
+<p>Ask on the call if a TL is present. If you have no response, try asking in our internal Chat. If there is no TL the sysadmin can take over this role temporarily.</p>
 <h4 id="there-is-not-enough-information">There is not enough information!<a class="headerlink" href="#there-is-not-enough-information" title="Permanent link">#</a></h4>
-<p>The definitive source of information for all issues is in DoIT. If at any point there is a discrepancy ask the TL or Sysadmins to provide up to date information and update the card/tasks accordingly. At a minimum information should be available in Slack.</p>
-<h4 id="i-can-join-the-call-or-slack-but-not-both-what-should-i-do">I can join the call or Slack, but not both, what should I do?<a class="headerlink" href="#i-can-join-the-call-or-slack-but-not-both-what-should-i-do" title="Permanent link">#</a></h4>
-<p>You're welcome to join only one of the channels, however you should not actively participate in the incident response if so, as it causes disjoined communication. Liaise with someone who is both in Slack and on the call to provide any input you may have so that they can raise it.</p>
+<p>The definitive source of information for all issues is in DoIT. If it is lacking there then you need to make a note of it and make sure that whoever created the card understands the importance of complete information in a timely manner. If at any point there is a discrepancy ask the TL or Sysadmins to provide up to date information and update the card/tasks accordingly. </p>
           <aside class="copyright" role="note">
             
               Copyright &copy; Spearhead Systems, Inc. &ndash;
diff --git a/before/different_roles/index.html b/before/different_roles/index.html
index 8a5a869..2569619 100644
--- a/before/different_roles/index.html
+++ b/before/different_roles/index.html
@@ -466,14 +466,14 @@
           
             <h1>Different Roles</h1>
           
-          <p>Our support services are deliviered via a flat organizational structure. The same people that deliver projects are also there to deliver ongoing support/maintenance services.
-There are several roles in our support team at Spearhead Systems. Certain roles only have one person per incident (e.g. sysadmin), whereas other roles can have multiple people (e.g. Sysadmins, Solution Architects, etc.). It's all about coming together as a team, working the problem, and getting a solution quickly.</p>
+          <p>Our support services are currently delivered via a flat organizational structure.
+There are, however, several roles in our support team at Spearhead Systems. Certain roles only have one person per incident (e.g. sysadmin), whereas other roles can have multiple people (e.g. Sysadmins, Solution Architects, etc.). It's all about coming together as a team, working the problem, and getting a solution quickly.</p>
 <p>Here is a rough outline of our role hierarchy, with each role discussed in more detail on the rest of this page.</p>
 <p><img alt="Incident Response Structure" src="../../assets/img/misc/incident_roles.png" /></p>
 <hr />
 <h2 id="team-leader-tl">Team Leader (TL)<a class="headerlink" href="#team-leader-tl" title="Permanent link">#</a></h2>
 <h3 id="what-is-it">What is it?<a class="headerlink" href="#what-is-it" title="Permanent link">#</a></h3>
-<p>A Team Leader acts as the single source of truth of what is currently happening and what is going to happen during an major incident. They come in all shapes, sizes, and colors. TL's are also the key elements in a project (boards in DoIT).</p>
+<p>A Team Leader acts as the single source of truth of what is currently happening and what is going to happen during a major incident and in general ongoing support. They come in all shapes, sizes, and colors. TL's are also the key elements in a project (boards in DoIT).</p>
 <h3 id="why-have-one">Why have one?<a class="headerlink" href="#why-have-one" title="Permanent link">#</a></h3>
 <p>As any system grows in size and complexity, things break and cause incidents. The TL is needed to help drive major incidents to resolution by organizing his team towards a common goal. A TL's skillset includes project and resource management skills which are essential in driving both projects and incidents to a smooth resolution.</p>
 <h3 id="what-are-the-responsibilities">What are the responsibilities?<a class="headerlink" href="#what-are-the-responsibilities" title="Permanent link">#</a></h3>
@@ -483,6 +483,7 @@ There are several roles in our support team at Spearhead Systems. Certain roles
 <li>Create the DoIT board(s) and other project planning related materials.</li>
 <li>Funnel people to these communications channels.</li>
 <li>Train team members on how to communicate and train other TL's.</li>
+<li>Train team members and help them prepare with the proper know-how/tools to deliver the project.</li>
 </ul>
 </li>
 <li>Drive incidents and projects to resolution,<ul>
@@ -505,7 +506,7 @@ There are several roles in our support team at Spearhead Systems. Certain roles
 </li>
 </ol>
 <h3 id="who-are-they">Who are they?<a class="headerlink" href="#who-are-they" title="Permanent link">#</a></h3>
-<p>Anyone on the TL on-call schedule. Trainees are typically on the TL Shadow schedule.</p>
+<p>Anyone on the on-call schedule is a TL during their shift. Trainees are typically on the TL Shadow schedule.</p>
 <h3 id="how-can-i-become-one">How can I become one?<a class="headerlink" href="#how-can-i-become-one" title="Permanent link">#</a></h3>
 <p>Take a look at our <a href="/training/incident_commander.md">Team Leader training guide</a>.</p>
 <hr />
@@ -513,15 +514,15 @@ There are several roles in our support team at Spearhead Systems. Certain roles
 <h3 id="what-is-it_1">What is it?<a class="headerlink" href="#what-is-it_1" title="Permanent link">#</a></h3>
 <p>A Sysadmin is a direct support role for the Team Leader. This is not a shadow where the person just observes, the Sysadmin is expected to perform important tasks during an incident.</p>
 <h3 id="why-have-one_1">Why have one?<a class="headerlink" href="#why-have-one_1" title="Permanent link">#</a></h3>
-<p>It's important for the TL to focus on the problem at hand, rather than worrying about documenting the steps or monitoring timers. The Sysadmin helps to support the TL and keep them stay focussed on the incident.</p>
+<p>It's important for the TL to focus on the problem at hand, rather than worrying about documenting the steps or monitoring timers. The Sysadmin supports the TL and helps them stay focused on the incident.</p>
 <h3 id="what-are-the-responsibilities_1">What are the responsibilities?<a class="headerlink" href="#what-are-the-responsibilities_1" title="Permanent link">#</a></h3>
 <p>The Sysadmin is expected to:</p>
 <ol>
 <li>Bring up issues to the TL that may otherwise not be addressed (keeping an eye on timers that have been started, circling back around to missed items from a roll call, etc).</li>
 <li>Be a "hot standby" TL, should the primary need to either transition to a SME, or otherwise have to step away from the TL role.</li>
-<li>Page SME's or other on-call engineers as instructed by the Team Leader.</li>
+<li>Call SME's or other on-call engineers as instructed by the Team Leader.</li>
 <li>Manage the incident call, and be prepared to remove people from the call if instructed by the Team Leader.</li>
-<li>Liaise with stakeholders and provide status updates on DoIT (using worklogs and comments), Slack and email/telefone as necessary.</li>
+<li>Liaise with stakeholders and provide status updates on DoIT (using worklogs and comments), internal Chat and email/telephone as necessary.</li>
 </ol>
 <h3 id="who-are-they_1">Who are they?<a class="headerlink" href="#who-are-they_1" title="Permanent link">#</a></h3>
 <p>Any Team Leader can act as a Sysadmin. Sysadmins need to be trained as an Team Leader as they may be required to take over command.</p>
@@ -537,7 +538,7 @@ There are several roles in our support team at Spearhead Systems. Certain roles
 <p>The Scribe is expected to:</p>
 <ol>
 <li>Ensure the incident call is being recorded.</li>
-<li>Note in DoIT, Slack, etc. important data, events, and actions, as they happen. Specifically:<ul>
+<li>Note in DoIT, internal Chat, etc. important data, events, and actions, as they happen. Specifically:<ul>
 <li>Key actions as they are taken (Example: "prod-server-387723 is being restarted to attempt to remove the stuck lock")</li>
 <li>Status reports when one is provided by the TL (Example: "We are in IN-Major, service A is currently not processing events due to a stuck lock, X is restarting the app stack, next checkin in 3 minutes")</li>
 <li>Any key callouts either during the call or at the ending review (Example: "Note: (Bob B) We should have a better way to determine stuck locks.")</li>
diff --git a/before/severity_levels/index.html b/before/severity_levels/index.html
index 2dbccde..fa8efa1 100644
--- a/before/severity_levels/index.html
+++ b/before/severity_levels/index.html
@@ -432,8 +432,8 @@
           
             <h1>Severity Levels</h1>
           
-          <p>The first step in any incident response process is to determine what actually constitutes an incident. We have two high level categories for classifying incidents: this is done using "SR" or "IN" defintions with an attached priority of "Minor", "Normal" or "Major". "SR" are "Service requests" initiated by a customer and usually do not constitute a critical issue (there are exceptions) while "IN" are "incidents" which are generally "urgent".</p>
-<p>All of our operational issues are to be classified as either a Service Request or an Incident. Incidents have priority over Service Requests provided that there are no Service Requests with a higher priority. In general you will want to resolve a higher severity SR or IN than a lower one (a "Major" priority gets a more intensive response than a "Normal" incident for example).</p>
+          <p>The first step in any incident response process is to determine what actually constitutes an incident. We have two high level categories for classifying incidents: these are "SR" or "IN" definitions with an attached priority of "Minor", "Normal" or "Major". "SR" are "Service requests" initiated by a customer and usually do not constitute a critical issue (there are exceptions) while "IN" are "incidents" which are generally "urgent".</p>
+<p>All issues reported to Spearhead are to be classified as either a Service Request or an Incident. Incidents have priority over Service Requests provided that there are no Service Requests with a higher priority. In general you will want to resolve a higher severity SR or IN than a lower one (a "Major" priority gets a more intensive response than a "Normal" incident for example).</p>
 <div class="admonition note">
 <p class="admonition-title">Always Assume The Worst</p>
 <p>If you are unsure which level an incident is (e.g. not sure if IN is Major or Normal), <strong>treat it as the higher one</strong>. During an incident is not the time to discuss or litigate severities, just assume the highest and review during a post-mortem.</p>
@@ -459,7 +459,7 @@
       <td>See <a href="/during/during_an_incident">During an Incident</a>.</td>
     </tr>
     <tr>
-      <td class="sev-2">Normal</td>
+      <td class="sev-1">Major</td>
       <td>
         <ul>
           <li>Functionality of virtualization platform is severely impaired.</li>
@@ -468,9 +468,7 @@
       </td>
       <td>See <a href="/during/during_an_incident">During an Incident</a>.</td>
     </tr>
-    <tr>
-      <td class="warning" colspan="3">Anything above this line is considered a "Major Incident". These are generally Incidents (IN). Below are service requests (SR) which are usually initiated by a human who can help with prioritizing. A call is triggered for all major incidents (indifferently of SR or IN).</td>
-    </tr>
+
     <tr>
       <td class="sev-2">Normal</td>
       <td>
diff --git a/during/during_an_incident/index.html b/during/during_an_incident/index.html
index 6384bd0..da34355 100644
--- a/during/during_an_incident/index.html
+++ b/during/during_an_incident/index.html
@@ -475,18 +475,18 @@
           <p>Information on what to do during a major incident. See our <a href="../../before/severity_levels/">severity level descriptions</a> for what constitutes a major incident.</p>
 <div class="admonition note">
 <p class="admonition-title">Documentation</p>
-<p>Always document your activities. Keep a detailed worklog of your actions in DoIT and communicate verbosely on Slack or other channels (email, etc.). </p>
+<p>Always document your activities. Keep a detailed worklog of your actions in DoIT and communicate verbosely in our internal Chat or other channels (email, etc.). </p>
 <p><table class="custom-table" id="contact-summary">
   <thead>
   </thead>
   <tbody>
     <tr>
-      <td><a href="#">#support</a></td>
+      <td><a href="#">#support</a> (on MS Teams/internal Chat)</td>
       <td><a href="#">http://response.spearhead.systems</a></td>
       <td><a href="#">+40728 005 263</a> </td>
     </tr>
     <tr>
-      <td colspan="3" class="centered">Need an TL? Do <code>!tl page</code> in Slack</td>
+      <td colspan="3" class="centered">Need a TL? Use a Sysadmin!</td>
     </tr>
     <tr>
       <td colspan="3"><em>For executive summary updates only, join <a href="#">#executive-summary-updates</a>.</em></td>
@@ -510,15 +510,15 @@
 <li>
 <p>Follow along with the call/chat, add any comments you feel are appropriate, but keep the discussion relevant to the problem at hand.</p>
 <ul>
-<li>If you are not an SME, try to filter any discussion through the primary SME for your service. Too many people discussing at once get become overwhelming, so we should try to maintain a hierarchical structure to the call if possible.</li>
+<li>If you are not an SME, try to filter any discussion through the primary SME for your service. Too many people discussing at once becomes overwhelming, so we try to maintain a hierarchical structure to the call if possible.</li>
 </ul>
 </li>
 <li>
 <p>Follow instructions from the Team Leader.</p>
 <ul>
 <li><strong>Is there no TL on the call?</strong><ul>
-<li>Manually page them via Slack, with <code>!tl page</code> in Slack. This will page the primary and backup TL's at the same time.</li>
-<li>Never hesitate to page the TL. It's much better to have them and not need them than the other way around.</li>
+<li>Call them! </li>
+<li>Never hesitate to call the TL. It's much better to have them and not need them than the other way around.</li>
 </ul>
 </li>
 </ul>
@@ -526,13 +526,13 @@
 </ol>
 <div class="admonition info">
 <p class="admonition-title">Not a call?</p>
-<p>Not all issues begin with a formal call. Some issues are self-explanatory and automatically generated via our monitoring platforms, a customer logging on to our portal, etc. In these scenarios <a href="http://doit.sphs.ro">DoIT</a> is the definitive source. If that is not sufficient ask your TL.</p>
+<p>Not all issues begin with a formal call. Some issues are self-explanatory and automatically generated via our monitoring platforms, a customer logging on to our portal, etc. In these scenarios <a href="http://doit.sphs.ro">DoIT</a> is the definitive source. If that is not sufficient ask your TL and Sysadmin.</p>
 </div>
 <h2 id="steps-for-the-team-leader">Steps for the Team Leader<a class="headerlink" href="#steps-for-the-team-leader" title="Permanent link">#</a></h2>
 <p>Resolve the incident as quickly and as safely as possible, use the Sysadmin to assist you. Delegate any tasks to relevant experts at your discretion.</p>
 <ol>
 <li>
-<p>Announce on the call, in DoIT and in Slack that you are the team leader, who you have designated as sysadmin (usually the backup TL), and scribe/juniors if any.</p>
+<p>Announce on the call, in DoIT and in our internal Chat that you are the team leader, who you have designated as sysadmin (usually the backup TL), and scribe/juniors if any.</p>
 </li>
 <li>
 <p>Identify if there is an obvious cause to the incident (recent deployment, spike in traffic, etc.), delegate investigation to relevant experts,</p>
@@ -559,7 +559,7 @@
 <li>
 <p>Once incident has recovered or is actively recovering, you can announce that the incident is over and that the call is ending. This usually indicates there's no more productive work to be done for the incident right now.</p>
 <ul>
-<li>Move the remaining, non-time-critical discussion to Slack.</li>
+<li>Move the remaining, non-time-critical discussion to our internal Chat.</li>
 <li>Follow up to ensure the customer liaison wraps up the incident publicly.</li>
 <li>Identify any post-incident clean-up work.</li>
 <li>You may need to perform debriefing/analysis of the underlying root cause.</li>
@@ -582,7 +582,7 @@
 <p>Be prepared to page other people as directed by the Team Leader.</p>
 </li>
 <li>
-<p>Provide regular status updates in Slack (roughly every 30mins) to the executive team, giving an executive summary of the current status. Keep it short and to the point, and use @here.</p>
+<p>Provide regular status updates in our internal Chat (roughly every 30mins) to the executive team, giving an executive summary of the current status. Keep it short and to the point, and use @&lt;channel-name&gt;.</p>
 </li>
 <li>
 <p>Perform any remediations, checking graphs, analysis or investigating logs unless otherwse delegated by the TL. </p>
diff --git a/during/security_incident_response/index.html b/during/security_incident_response/index.html
index 6c16cfd..ef631f2 100644
--- a/during/security_incident_response/index.html
+++ b/during/security_incident_response/index.html
@@ -548,7 +548,7 @@
 <ul>
 <li>Shutdown the instance from the provider console (do not delete or terminate if you can help it, as we'll need to do forensics).</li>
 <li>If you happen to be logged into the box you can try to,<ul>
-<li>Re-instate our default iptables rules to restrict traffic.</li>
+<li>Apply firewall rules to restrict traffic.</li>
 <li><code>kill -9</code> any active session you think is an attacker.</li>
 <li>Change root password, and update /etc/shadow to lock out all other users.</li>
 <li><code>sudo shutdown now</code></li>
@@ -559,17 +559,18 @@
 <p>Identify the likely attack vectors and path/fix them so they cannot be re-exploited immediately after stopping the attack.</p>
 <ul>
 <li>If you suspect a third-party provider is compromised, delete all accounts except your own (and those of others who are physically present) and immediately rotate your password and MFA tokens.</li>
+<li>Disable/remove all SSH keys except your own and those of others who are physically present.</li>
 <li>If you suspect a service application was an attack vector, disable any relevant code paths, or shut down the service entirely.</li>
 </ul>
 <h2 id="assemble-response-team">Assemble Response Team<a class="headerlink" href="#assemble-response-team" title="Permanent link">#</a></h2>
-<p>Identify the key responders for the security incident, and keep them all in the loop. Set up a secure method of communicating all information associated with the incident. Details on the incident (or even the fact that an incident has occurred) should be kept private to the responders until you are confident the attack is not being triggered internally.</p>
+<p>Identify the key responders for the security incident, and keep them all in the loop. Set up a secure method of communicating all information associated with the incident (internal Chat is one option). Details on the incident (or even the fact that an incident has occurred) should be kept private to the responders until you are confident the attack is not being triggered internally.</p>
 <ul>
 <li>The security and site-reliability teams should usually be involved.</li>
 <li>A representative for any affected services should be involved.</li>
 <li>A Team Leader (TL) should be appointed, who will also appoint the usual incident command roles. The incident command team will be responsible for keeping documentation of actions taken, and for notifying internal stakeholders as appropriate.</li>
 <li>Do not communicate with anyone not on the response team about the incident until forensics has been performed. The attack could be happening internally.</li>
 <li>Give the project an innocuous codename that can be used for chats/documents so if anyone overhears they don't realize it's a security incident. (e.g. sapphire-unicorn).</li>
-<li>Prefix all emails, and chat topics with "Attorney Work Project".</li>
+<li>Prefix all emails, and chat topics with "Legal Work Project".</li>
 </ul>
 <h2 id="isolate-affected-instances">Isolate Affected Instances<a class="headerlink" href="#isolate-affected-instances" title="Permanent link">#</a></h2>
 <p>Any instances which were affected by the attack should be immediately isolated from any other instances. As soon as possible, an image of the system should be taken and put into a read-only cold storage for later forensic analysis.</p>
@@ -623,7 +624,7 @@
 <li>Monitor logs for any attempt to regain access to the system by the attacker.</li>
 </ul>
 <h2 id="internal-communication">Internal Communication<a class="headerlink" href="#internal-communication" title="Permanent link">#</a></h2>
-<p><strong>Delegate to:</strong> VP or Director of Engineering</p>
+<p><strong>Delegate to:</strong> CTO, GM</p>
 <p>Communicate internally only once you are confident (via forensic analysis) that the attack was not sourced internally.</p>
 <ul>
 <li>Don't go into too much detail.</li>
@@ -632,7 +633,7 @@
 <li>Follow up with more information once it is known.</li>
 </ul>
 <h2 id="liaise-with-law-enforcement-external-actors">Liaise With Law Enforcement / External Actors<a class="headerlink" href="#liaise-with-law-enforcement-external-actors" title="Permanent link">#</a></h2>
-<p><strong>Delegate to:</strong> VP or Director of Engineering</p>
+<p><strong>Delegate to:</strong> CTO, GM</p>
 <p>Work with law enforcement to identify the source of the attack, letting any system owners know that systems under their control may be compromised, etc.</p>
 <ul>
 <li>Contact local law enforcement.</li>
@@ -641,7 +642,7 @@
 <li>Contact security companies to help in assessing risk and any PR next steps.</li>
 </ul>
 <h2 id="external-communication">External Communication<a class="headerlink" href="#external-communication" title="Permanent link">#</a></h2>
-<p><strong>Delegate to:</strong> TL, Marketing Team</p>
+<p><strong>Delegate to:</strong> TL, PR/Marketing</p>
 <p>Once you have validated all of the information you have is accurate, have a timeline of events, and know exactly what information was compromised, how it was compromised, and sure that it won't happen again. Only then should you prepare and release a public statement to customers informing them of the compromised information and any steps they need to take.</p>
 <ul>
 <li>Include the date in the title of any announcement, so that it's never confused for a potential new breach.</li>
diff --git a/index.html b/index.html
index ab100c7..9f110e3 100644
--- a/index.html
+++ b/index.html
@@ -468,11 +468,11 @@
           
             <h1>Spearhead Systems Incident Response Documentation</h1>
           
-          <p>This documentation covers parts of the Spearhead Systems Incident Response process. It is a copy of <a href="https://github.com/PagerDuty/incident-response-docs/">PagerDuty's</a> documentation and furthermore a cut-down version of our own internal documentation, used at Spearhead Systems for any issue (incident or service request), and to prepare new employees for on-call responsibilities. It provides information not only on preparing for an incident or service request, but also what to do during and after. It is intended to be used by those involved in our operational technical support response process (or those wishing to become part of our support team). See the <a href="about/">about page</a> for more information on what this documentation is and why it exists. 
+          <p>This documentation covers parts of the Spearhead Systems technical support response process. It is used at Spearhead Systems for any technical issue (incident or service request), and to prepare new employees for technical support responsibilities. It provides information not only on preparing for an incident or service request, but also what to do during and after. It is intended to be used by those involved in our operational technical support response process (or those wishing to become part of our support team). See the <a href="about/">about page</a> for more information on what this documentation is and why it exists. 
 This documentation is complementary to what is available in our <a href="https://sphsys.sharepoint.com">existing wiki</a> and other systems that have not been open sourced.</p>
 <div class="admonition note">
-<p class="admonition-title">Issue, Incident and Service Request</p>
-<p>At Spearhead we use the term <em>issue</em> to define any request from our customers. Issues fall into two categories: "Service Requests (SR)" and "Incidents (IN)". An IN will generally be an issue that has impact on the normal functioning of the business while a SR generally does not.</p>
+<p class="admonition-title">Issue: Incidents and Service Request</p>
+<p>At Spearhead we use the term <em>issue</em> to define any request that we receive. Issues fall into two categories: "Service Requests (SR)" and "Incidents (IN)". An IN will generally be an issue that has impact on the normal functioning of the business while a SR generally does not.</p>
 </div>
 <p><img alt="Incident Response at Spearhead Systems" src="./assets/img/headers/sph_ir.jpg" /></p>
 <h2 id="being-on-call">Being On-Call<a class="headerlink" href="#being-on-call" title="Permanent link">#</a></h2>
@@ -482,7 +482,7 @@ This documentation is complementary to what is available in our <a href="https:/
 <li><a href="oncall/alerting_principles/">Alerting Principles</a> - <em>The principles we use to determine what things notify an engineer, and what time of day they do so.</em></li>
 </ul>
 <h2 id="before-an-incident">Before an Incident<a class="headerlink" href="#before-an-incident" title="Permanent link">#</a></h2>
-<p>Reading material for things you probably want to know before an incident occurs. You likely don't want to be reading these during an actual incident.</p>
+<p>Reading material for things you want to know before an incident occurs. You don't want to be reading these during an actual incident.</p>
 <ul>
 <li><a href="before/severity_levels/">Severity Levels</a> - <em>Information on our severity level classification. What constitutes a Low issue? What's a "Major Incident"?, etc.</em></li>
 <li><a href="before/different_roles/">Different Roles for Incidents</a> - <em>Information on the roles during an incident; Team Leader, Sysadmin, etc.</em></li>
diff --git a/mkdocs/search_index.json b/mkdocs/search_index.json
index 5d4c0e6..e87b566 100644
--- a/mkdocs/search_index.json
+++ b/mkdocs/search_index.json
@@ -2,7 +2,7 @@
     "docs": [
         {
             "location": "/", 
-            "text": "This documentation covers parts of the Spearhead Systems Incident Response process. It is a copy of \nPagerDuty's\n documentation and furthermore a cut-down version of our own internal documentation, used at Spearhead Systems for any issue (incident or service request), and to prepare new employees for on-call responsibilities. It provides information not only on preparing for an incident or service request, but also what to do during and after. It is intended to be used by those involved in our operational technical support response process (or those wishing to become part of our support team). See the \nabout page\n for more information on what this documentation is and why it exists. \nThis documentation is complementary to what is available in our \nexisting wiki\n and other systems that have not been open sourced.\n\n\n\n\nIssue, Incident and Service Request\n\n\nAt Spearhead we use the term \nissue\n to define any request from our customers. Issues fall into two categories: \"Service Requests (SR)\" and \"Incidents (IN)\". An IN will generally be an issue that has impact on the normal functioning of the business while a SR generally does not.\n\n\n\n\n\n\nBeing On-Call\n#\n\n\nIf you've never been on-call before or part of a support delivery team, you might be wondering what it's all about. These pages describe what the expectations are, along with some resources to help you.\n\n\n\n\nBeing On-Call\n - \nA guide to being on-call, both what your responsibilities are, and what they are not.\n\n\nAlerting Principles\n - \nThe principles we use to determine what things notify an engineer, and what time of day they do so.\n\n\n\n\nBefore an Incident\n#\n\n\nReading material for things you probably want to know before an incident occurs. You likely don't want to be reading these during an actual incident.\n\n\n\n\nSeverity Levels\n - \nInformation on our severity level classification. What constitutes a Low issue? 
What's a \"Major Incident\"?, etc.\n\n\nDifferent Roles for Incidents\n - \nInformation on the roles during an incident; Team Leader, Sysadmin, etc.\n\n\nIncident Call Etiquette\n - \nOur etiquette guidelines for incident calls, before you find yourself in one.\n\n\n\n\nDuring an Incident\n#\n\n\nInformation and processes during an incident.\n\n\n\n\nDuring an Incident\n - \nInformation on what to do during an incident, and how to constructively contribute.\n\n\nSecurity Incident Response\n - \nSecurity incidents are handled differently to normal operational incidents.\n\n\n\n\nAfter an Incident\n#\n\n\nOur followup processes, how we make sure we don't repeat mistakes and are always improving.\n\n\n\n\nPost-Mortem Process\n - \nInformation on our post-mortem process; what's involved and how to write or run a post-mortem.\n\n\nPost-Mortem Template\n - \nThe template we use for writing our post-mortems for major incidents.\n\n\n\n\nTraining\n#\n\n\nSo, you want to learn about incident response? 
You've come to the right place.\n\n\n\n\nTraining Overview\n - \nAn overview of our training guides and additional training material from third-parties.\n\n\nIncident Commander Training\n - \nA guide to becoming our next Incident Commander.\n\n\nDeputy Training\n - \nHow to be a deputy and back up the Incident Commander.\n\n\nScribe Training\n - \nA guide to scribing.\n\n\nSubject Matter Expert Training\n - \nA guide on responsibilities and behavior for all participants in a major incident.\n\n\nGlossary of Incident Response Terms\n - \nA collection of terms that you may hear being used, along with their definition.\n\n\n\n\nAdditional Reading\n#\n\n\nUseful material and resources from external parties that are relevant to incident response.\n\n\n\n\nIncident Management for Operations\n (O'Reilly)\n\n\nIncident Response\n (O'Reilly)\n\n\nDebriefing Facilitation Guide\n (Etsy)\n\n\nUS National Incident Management System (NIMS)\n (FEMA)\n\n\nEvery Minute Counts: Leading Heroku's Incident Response\n (Blake Gentry)", 
+            "text": "This documentation covers parts of the Spearhead Systems technical support response process. It is used at Spearhead Systems for any technical issue (incident or service request), and to prepare new employees for technical support responsibilities. It provides information not only on preparing for an incident or service request, but also what to do during and after. It is intended to be used by those involved in our operational technical support response process (or those wishing to become part of our support team). See the \nabout page\n for more information on what this documentation is and why it exists. \nThis documentation is complementary to what is available in our \nexisting wiki\n and other systems that have not been open sourced.\n\n\n\n\nIssue: Incidents and Service Request\n\n\nAt Spearhead we use the term \nissue\n to define any request that we receive. Issues fall into two categories: \"Service Requests (SR)\" and \"Incidents (IN)\". An IN will generally be an issue that has impact on the normal functioning of the business while a SR generally does not.\n\n\n\n\n\n\nBeing On-Call\n#\n\n\nIf you've never been on-call before or part of a support delivery team, you might be wondering what it's all about. These pages describe what the expectations are, along with some resources to help you.\n\n\n\n\nBeing On-Call\n - \nA guide to being on-call, both what your responsibilities are, and what they are not.\n\n\nAlerting Principles\n - \nThe principles we use to determine what things notify an engineer, and what time of day they do so.\n\n\n\n\nBefore an Incident\n#\n\n\nReading material for things you want to know before an incident occurs. You don't want to be reading these during an actual incident.\n\n\n\n\nSeverity Levels\n - \nInformation on our severity level classification. What constitutes a Low issue? 
What's a \"Major Incident\"?, etc.\n\n\nDifferent Roles for Incidents\n - \nInformation on the roles during an incident; Team Leader, Sysadmin, etc.\n\n\nIncident Call Etiquette\n - \nOur etiquette guidelines for incident calls, before you find yourself in one.\n\n\n\n\nDuring an Incident\n#\n\n\nInformation and processes during an incident.\n\n\n\n\nDuring an Incident\n - \nInformation on what to do during an incident, and how to constructively contribute.\n\n\nSecurity Incident Response\n - \nSecurity incidents are handled differently to normal operational incidents.\n\n\n\n\nAfter an Incident\n#\n\n\nOur followup processes, how we make sure we don't repeat mistakes and are always improving.\n\n\n\n\nPost-Mortem Process\n - \nInformation on our post-mortem process; what's involved and how to write or run a post-mortem.\n\n\nPost-Mortem Template\n - \nThe template we use for writing our post-mortems for major incidents.\n\n\n\n\nTraining\n#\n\n\nSo, you want to learn about incident response? 
You've come to the right place.\n\n\n\n\nTraining Overview\n - \nAn overview of our training guides and additional training material from third-parties.\n\n\nIncident Commander Training\n - \nA guide to becoming our next Incident Commander.\n\n\nDeputy Training\n - \nHow to be a deputy and back up the Incident Commander.\n\n\nScribe Training\n - \nA guide to scribing.\n\n\nSubject Matter Expert Training\n - \nA guide on responsibilities and behavior for all participants in a major incident.\n\n\nGlossary of Incident Response Terms\n - \nA collection of terms that you may hear being used, along with their definition.\n\n\n\n\nAdditional Reading\n#\n\n\nUseful material and resources from external parties that are relevant to incident response.\n\n\n\n\nIncident Management for Operations\n (O'Reilly)\n\n\nIncident Response\n (O'Reilly)\n\n\nDebriefing Facilitation Guide\n (Etsy)\n\n\nUS National Incident Management System (NIMS)\n (FEMA)\n\n\nEvery Minute Counts: Leading Heroku's Incident Response\n (Blake Gentry)", 
             "title": "Home"
         }, 
         {
@@ -12,7 +12,7 @@
         }, 
         {
             "location": "/#before-an-incident", 
-            "text": "Reading material for things you probably want to know before an incident occurs. You likely don't want to be reading these during an actual incident.   Severity Levels  -  Information on our severity level classification. What constitutes a Low issue? What's a \"Major Incident\"?, etc.  Different Roles for Incidents  -  Information on the roles during an incident; Team Leader, Sysadmin, etc.  Incident Call Etiquette  -  Our etiquette guidelines for incident calls, before you find yourself in one.", 
+            "text": "Reading material for things you want to know before an incident occurs. You don't want to be reading these during an actual incident.   Severity Levels  -  Information on our severity level classification. What constitutes a Low issue? What's a \"Major Incident\"?, etc.  Different Roles for Incidents  -  Information on the roles during an incident; Team Leader, Sysadmin, etc.  Incident Call Etiquette  -  Our etiquette guidelines for incident calls, before you find yourself in one.", 
             "title": "Before an Incident"
         }, 
         {
@@ -37,17 +37,17 @@
         }, 
         {
             "location": "/oncall/being_oncall/", 
-            "text": "A summary of expectations and helpful information for being on-call.\n\n\n\n\nWhat is On-Call?\n#\n\n\nAt Spearhead being on-call means that you are able to be contacted at any time in order to investigate and fix issues that may arise. There are two on-call scenarios that you will deal with:\n\n\n\n\nduring your normal work shift\n\n\nbeing on-call for outside working hours\n\n\n\n\nFor example, if you are on-call outside normal working hours, should any alarms be triggered by our monitoring solution, you will receive a \"page\" (an alert on your mobile device, email, phone call, or SMS, etc.) giving you details on what has broken. You will be expected to take whatever actions are necessary in order to resolve the issue and return your service to a normal state. \n\n\nAt Spearhead Systems we consider you are on-call during normal working hours in which case you are proactively working with \nDoIT\n and looking over your assigned cards/boards as well as when you are formally \"on-call\" and issues are being redirected to you.\n\n\nOn-call responsibilities extend beyond normal office hours, and if you are on-call you are expected to be able to respond to issues, even at 2am. This sounds horrible (and it can be), but this is what our customers go through, and is the problem that the Spearhead Systems professional services is trying to fix!\n\n\nResponsibilities\n#\n\n\n\n\n\n\nPrepare\n\n\n\n\nHave your laptop and Internet with you (office, home, a MiFi dongle, a phone with a tethering plan, etc).\n\n\nHave a way to charge your MiFi.\n\n\n\n\n\n\nTeam alert escalation happens within 5 minutes, set/stagger your notification timeouts (push, SMS, phone...) 
accordingly.\n\n\nMake sure Spearhead Systems (and colleagues directly) texts and calls can bypass your \"Do Not Disturb\" settings.\n\n\n\n\n\n\nBe prepared (environment is set up, you have remote access tools ready and functional, your credentials are current, you have Java installed, ssh-keys and so on...)\n\n\nRead our Issue Response documentation (that's this!) to understand how we handle incidents and service requests, what the different roles and methods of communication are, etc.\n\n\nBe aware of your upcoming on-call time (primary, backup) and arrange swaps around travel, vacations, appointments etc.\n\n\n\n\n\n\n\n\nTriage\n\n\n\n\nAcknowledge and act on alerts whenever you can (see the first \"Not responsibilities\" point below)\n\n\nDetermine the urgency of the problem:\n\n\nIs it something that should be worked on right now or escalated into a major incident? (\"production server on fire\" situations. Security alerts) - do so.\n\n\nIs it some tactical work that doesn't have to happen during the night? (for example, disk utilization high watermark, but there's plenty of space left and the trend is not indicating impending doom) - snooze the alert until a more suitable time (working hours, the next morning...) and get back to fixing it then.\n\n\n\n\n\n\nCheck Slack for current activity. Often (but not always) actions that could potentially cause alerts will be announced there.\n\n\nDoes the alert and your initial investigation indicate a general problem or an issue with a specific service that the relevant team should look into? 
If it does not look like a problem you are the expert for, then escalate to another team member or group.\n\n\n\n\n\n\n\n\nFix\n\n\n\n\nYou are empowered to dive into any problem and act to fix it.\n\n\nInvolve other team members as necessary: do not hesitate to escalate if you cannot figure out the cause within a reasonable timeframe or if the service / alert is something you have not tackled before.\n\n\nIf the issue is not very time sensitive and you have other priority work, make a note of this in DoIT to keep a track of it (with an appropriate severity and due date).\n\n\n\n\n\n\n\n\nImprove\n\n\n\n\nIf a particular issue keeps happening; if an issue alerts often but turns out to be a preventable non-issue \u2013 perhaps improving this should be a longer-term task.\n\n\nDisks that fill up, logs that should be rotated, noisy alerts...(we use ansible, go ahead and start automating!)\n\n\n\n\n\n\nIf information is difficult / impossible to find, write it down. Constantly refactor and improve our knowledge base and documentation. Add redundant links and pointers if your mental model of the wiki / codebase does not match the way it is currently organized.\n\n\n\n\n\n\n\n\nSupport\n\n\n\n\nWhen your on-call \"shift\" ends, let the next on-call and team know about issues that have not been resolved yet and other experiences of note.\n\n\nMake an effort to cleanly handover necessary information. We use Slack, email and DoIT to communicate. 
\n\n\nThis is a best-practice that should be applied whenever there are details that by sharong would benefit the efficiency of the team.\n\n\n\n\n\n\nIf you are making a change that impacts the schedule (adding / removing yourself, for example), let others know since many of us make arrangements around the on-call schedule well in advance.\n\n\nSupport each other: when doing activities that might generate plenty of pages, it is courteous to \"take the page\" away from the on-call by notifying them and scheduling an override for the duration.\n\n\n\n\n\n\n\n\nNot Responsibilities\n#\n\n\n\n\n\n\nNo expectation to be the first to acknowledge \nall\n of the alerts during the on-call period.\n\n\n\n\nCommute (and other necessary distractions) are facts of life, and sometimes it is not possible to receive or act on an alert before it escalates. That's why we have the backup on-call and schedule for.\n\n\n\n\n\n\n\n\nNo expectation to fix all issues by yourself.\n\n\n\n\nNo one knows everything. Your whole team is here to help. There is no shame, and much to be learned, by escalating issues you are not certain about. \"Never hesitate to escalate\".\n\n\nService owners will always know more about how their stuff works. Especially if our and their documentation is lacking, double-checking with the relevant team avoids mistakes. Measure twice, cut once \u2013 and it's often best to let the subject matter expert do the cutting.\n\n\n\n\n\n\n\n\nRecommendations\n#\n\n\n\n\n\n\nAlways have a backup schedule. Yes, this means two people being on-call at the same time, however it takes a lot of the stress off of the primary if they know they have a specific backup they can contact, rather than trying to chose a random member of the team. \n\n\n\n\n\n\nThe third-level of your escalation (after backup schedule) should probably be your entire team. 
This should hopefully never happen, but when it does, it's useful to be able to just get the next available person.\n\n\n\n\n\n\n\n\n\n\n\n\nTeam leaders (TL) can (and should) be part of your normal rotation. It gives a better insight into what has been going on.\n\n\n\n\n\n\nNew members of the team should shadow your on-call rotation during the first few weeks. They should get all alerts, and should follow along with what you are doing. (All new employees shadow the Support team for one week of on-call, but it's useful to have new team members shadow your team rotations also. Just not at the same time).\n\n\n\n\n\n\nOur escalation timeout is set to 5 minutes. This is usually plenty of time for someone to acknowledge the incident if they're able to. If they're not able to within 5 minutes, then they're probably not in a good position to respond to the incident anyway.\n\n\n\n\nTriggering an escalation is done automatically in most situations based on the type, priority and severity of the issue.\n\n\n\n\n\n\n\n\nWhen going off-call, you should provide a quick summary to the next on-call about any issues that may come up during their shift. A service has been flapping, an issue is likely to re-occur, etc. If you want to be formal, this can be a written report via email, but generally a verbal summary is sufficient.\n\n\n\n\n\n\nNotification Method Recommendations\n#\n\n\nYou are free to set up your notification rules as you see fit, to match how you would like to best respond to incidents. If you're not sure how to configure them, the Support team has some recommendations,\n\n\n\n\n\n\nUse Push Notification and Email as your first method of notification. Most of us have phones with us at all times, so this is a prudent first method and is usually sufficient. (DoIT is in the process of integratoin with SNS for push notifications)\n\n\nUse Phone and/or SMS notification each minute after, until the escalation time. 
If Push didn't work, then it's likely you need something stronger, like a phone call. Keep calling every minute until it's too late. If you don't pick up by the 3rd time, then it's unlikely you are able to respond, and the incident will get escalated away from you.\n\n\n\n\nEtiquette\n#\n\n\n\n\n\n\nIf the current on-call comes into the office at 12pm looking tired, it's not because they're lazy. They probably got paged in the night. Cut them some slack and be nice.\n\n\n\n\n\n\nDon't acknowledge an incident out from under someone else. If you didn't get paged for the incident, then you shouldn't be acknowledging it. Add a comment with your notes instead.\n\n\n\n\n\n\n\n\n\n\n\n\nIf you are testing something, or performing an action that you know will cause a page (notification, alert), it's customary to \"take the pager\" for the time during which you will be testing. Notify the person on-call that you are taking the pager for the next hour while you test.\n\n\n\n\n\n\n\"Never hesitate to escalate\" - Never feel ashamed to rope in someone else if you're not sure how to resolve an issue. Likewise, never look down on someone else if they ask you for help.\n\n\n\n\n\n\nAlways consider covering an hour or so of someone else's on-call time if they request it and you are able. We all have lives which might get in the way of on-call time, and one day it might be you who needs to swap their on-call time in order to have a night out with your friend from out of town.\n\n\n\n\n\n\nIf an issue comes up during your on-call shift for which you got paged, you are responsible for resolving it. Even if it takes 3 hours and there's only 1 hour left of your shift. You can hand over to the next on-call if they agree, but you should never assume that's possible.", 
+            "text": "A summary of expectations and helpful information for being on-call.\n\n\n\n\nWhat is On-Call?\n#\n\n\nAt Spearhead, being on-call means that you are responsible for monitoring our communications channels and responding to requests at any time. There are two on-call scenarios that you will deal with:\n\n\n\n\nduring your normal work shift\n\n\noutside working hours\n\n\n\n\nFor example, if you are on-call outside normal working hours, should any alarms be triggered by our monitoring solution or a customer emails our support channel, you will receive a \"notification\" (an alert on your mobile device, email, phone call, or SMS, etc.) giving you details on what has broken. \nYou will be expected to gather as much information as possible, create the required cards in our ticketing systems, delegate or assign the card to the right person/watchers and otherwise take whatever actions are necessary in order to resolve the issue. \n\n\n\n\n\nOn-call responsibilities extend beyond normal office hours, and if you are on-call you are expected to be able to respond to issues, even at 2am. This sounds horrible (and it can be), but this is what our customers go through, and is the problem that the Spearhead Systems technical support services is trying to fix!\n\n\nWhen you are on-call during normal working hours you are the central contact for our entire support team. We expect you will delegate and assign the card to your colleagues and only attempt to resolve issues if your current workload permits. \nWhen you are on-call outside working hours you are expected to handle as much of the process as possible and delegate only if it is outside your area of expertise or you encounter problems that require another colleagues input.\n\n\n\n\nWhen in the office\n\n\nYou are generally speaking on-call during your normal working hours even if you are not \nthe\n on-call engineer. 
This means you are keeping an eye on the cards assigned to you directly or that you are a watcher for. If you are ever in a position that you have no assigned cards and it is not clear what to work on ask a TL or senior Sysadmin to help point you in the right direction.\n\n\n\n\nResponsibilities\n#\n\n\n\n\n\n\nPrepare\n\n\n\n\nHave your laptop and Internet with you (office, home, a phone with a tethering plan, etc).\n\n\nHave a way to charge your phone.\n\n\n\n\n\n\nTeam alert escalation happens within 30 minutes, set/stagger your notification timeouts (push, SMS, phone...) accordingly.\n\n\nMake sure Spearhead Systems (and colleagues directly) texts and calls can bypass your \"Do Not Disturb\" settings.\n\n\n\n\n\n\nBe prepared (environment is set up, you have remote access tools ready and functional, your credentials are current, you have Java installed, ssh-keys and so on...)\n\n\nRead our Issue Response documentation (that's this!) to understand how we handle incidents and service requests, what the different roles and methods of communication are, etc.\n\n\nBe aware of your upcoming on-call time (primary, backup) and arrange swaps around travel, vacations, appointments etc.\n\n\n\n\n\n\n\n\nTriage\n\n\n\n\nAcknowledge and act on alerts whenever you can (see the first \"Not responsibilities\" point below)\n\n\nDetermine the urgency of the problem:\n\n\nIs it something that should be worked on right now or escalated into a major incident? (\"production server on fire\" situations. Security alerts) - do so.\n\n\nIs it some tactical work that doesn't have to happen during the night? (for example, disk utilization high watermark, but there's plenty of space left and the trend is not indicating impending doom) - snooze the issue until a more suitable time (working hours, the next morning...) and get back to fixing it then.\n\n\n\n\n\n\nCheck our \ninternal Chat\n for current activity. 
Often (but not always) actions that could potentially cause alerts will be announced there.\n\n\nDoes the alert and your initial investigation indicate a general problem or an issue with a specific service that the relevant team should look into? If it does not look like a problem you are the expert for, then escalate to another team member or group.\n\n\n\n\n\n\n\n\nFix\n\n\n\n\nYou are empowered to dive into any problem and act to fix it.\n\n\nInvolve other team members as necessary: do not hesitate to escalate if you cannot figure out the cause within a reasonable timeframe or if the service / alert is something you have not tackled before.\n\n\nIf the issue is not very time sensitive and you have other priority work, make a note of this in DoIT to keep a track of it (with an appropriate severity, comment and due date).\n\n\n\n\n\n\n\n\nImprove\n\n\n\n\nIf a particular issue keeps happening; if an issue alerts often but turns out to be a preventable non-issue \u2013 perhaps improving this should be a longer-term task.\n\n\nDisks that fill up, logs that should be rotated, noisy alerts...(we use ansible and rundeck, go ahead and start automating!)\n\n\nWhen we perform a DoD (definition of done) this is good time to bring up recurring issues for discussion.\n\n\n\n\n\n\nIf information is difficult / impossible to find, write it down. Constantly refactor and improve our knowledge base and documentation. Add redundant links and pointers if your mental model of the wiki / codebase does not match the way it is currently organized.\n\n\n\n\n\n\n\n\nSupport\n\n\n\n\nWhen your on-call \"shift\" ends, let the next on-call and team know about issues that have not been resolved yet and other experiences of note.\n\n\nMake an effort to cleanly handover necessary information. We use \ninternal Chat\n, email and DoIT to communicate. 
\n\n\nThis is a best-practice that should be applied whenever there are details that by sharing would benefit the efficiency of the team.\n\n\n\n\n\n\nIf you are making a change that impacts the schedule (adding / removing yourself, for example), let others know since many of us make arrangements around the on-call schedule well in advance.\n\n\nSupport each other: when doing activities that might generate plenty of alerts, it is courteous to \"place the service/host in maintenance\" and take it away from the on-call by notifying them and scheduling an override for the duration.\n\n\n\n\n\n\n\n\nNot Responsibilities\n#\n\n\n\n\n\n\nNo expectation to be the first to acknowledge \nall\n of the alerts during the on-call period.\n\n\n\n\nCommute (and other necessary distractions) are facts of life, and sometimes it is not possible to receive or act on an alert before it escalates. That's why we have the backup on-call and schedule for.\n\n\n\n\n\n\n\n\nNo expectation to fix all issues by yourself.\n\n\n\n\nNo one knows everything. Your whole team is here to help. There is no shame, and much to be learned, by escalating issues you are not certain about. \"Never hesitate to escalate\".\n\n\nService owners will always know more about how their stuff works. Especially if our and their documentation is lacking, double-checking with the relevant team avoids mistakes. Measure twice, cut once \u2013 and it's often best to let the subject matter expert do the cutting.\n\n\n\n\n\n\n\n\nRecommendations\n#\n\n\n\n\n\n\nAlways have a backup schedule. Yes, this means two people being on-call at the same time, however it takes a lot of the stress off of the primary if they know they have a specific backup they can contact, rather than trying to chose a random member of the team. \n\n\n\n\n\n\nThe third-level of your escalation (after backup schedule) should probably be your entire team. 
This should hopefully never happen, but when it does, it's useful to be able to just get the next available person.\n\n\n\n\n\n\n\n\n\n\n\n\nTeam leaders (TL) are a part of our normal rotation. It gives a better insight into what has been going on.\n\n\n\n\n\n\nNew members of the team should shadow your on-call rotation during the first few weeks. They should get all alerts, and should follow along with what you are doing. (All new employees shadow the Support team for one week of on-call, but it's useful to have new team members shadow your team rotations also.).\n\n\n\n\n\n\n\n\n\n\n\nWhen going off-call, you should provide a quick summary to the next on-call about any issues that may come up during their shift. A service has been flapping, an issue is likely to re-occur, etc. If you want to be formal, this can be a written report via email, but generally a verbal summary during our morning stand-up is sufficient.\n\n\n\n\nNotification Method Recommendations\n#\n\n\nYou are free to set up your notification rules as you see fit, to match how you would like to best respond to incidents. If you're not sure how to configure them, the Support team has some recommendations,\n\n\n\n\n\n\n\nEtiquette\n#\n\n\n\n\n\n\nIf the current on-call comes into the office at 12pm looking tired, it's not because they're lazy. They probably got paged in the night. Cut them some slack and be nice.\n\n\n\n\n\n\nDon't close or otherwise modify a card out from under someone else. If you didn't get that specific card assigned to you as owner or a watcher, then you shouldn't be modifying it. 
Add a comment with your notes instead in the monitoring system and in DoIT.\n\n\n\n\n\n\n\n\n\n\n\n\nIf you are testing something, or performing an action that you know will cause an alert from our monitoring or possibly may be identified as an issue by our customers, it's customary to \"place the host/service in downtime\" and announce all the involved parties, for the time during which you will be testing. Notify the person on-call so they are aware of your testing.\n\n\n\n\n\n\n\"Never hesitate to escalate\" - Never feel ashamed to rope in someone else if you're not sure how to resolve an issue. Likewise, never look down on someone else if they ask you for help.\n\n\n\n\n\n\nAlways consider covering an hour or so of someone else's on-call time if they request it and you are able. We all have lives which might get in the way of on-call time, and one day it might be you who needs to swap their on-call time in order to have a night out with your friend from out of town.\n\n\n\n\n\n\nIf an issue comes up during your on-call shift for which you got called, you are responsible for resolving it. Even if it takes 3 hours and there's only 1 hour left of your shift. You can hand over to the next on-call if they agree, but you should never assume that's possible.", 
             "title": "Being On-Call"
         }, 
         {
             "location": "/oncall/being_oncall/#what-is-on-call", 
-            "text": "At Spearhead being on-call means that you are able to be contacted at any time in order to investigate and fix issues that may arise. There are two on-call scenarios that you will deal with:   during your normal work shift  being on-call for outside working hours   For example, if you are on-call outside normal working hours, should any alarms be triggered by our monitoring solution, you will receive a \"page\" (an alert on your mobile device, email, phone call, or SMS, etc.) giving you details on what has broken. You will be expected to take whatever actions are necessary in order to resolve the issue and return your service to a normal state.   At Spearhead Systems we consider you are on-call during normal working hours in which case you are proactively working with  DoIT  and looking over your assigned cards/boards as well as when you are formally \"on-call\" and issues are being redirected to you.  On-call responsibilities extend beyond normal office hours, and if you are on-call you are expected to be able to respond to issues, even at 2am. This sounds horrible (and it can be), but this is what our customers go through, and is the problem that the Spearhead Systems professional services is trying to fix!", 
+            "text": "At Spearhead, being on-call means that you are responsible for monitoring our communications channels and responding to requests at any time. There are two on-call scenarios that you will deal with:   during your normal work shift  outside working hours   For example, if you are on-call outside normal working hours, should any alarms be triggered by our monitoring solution or a customer emails our support channel, you will receive a \"notification\" (an alert on your mobile device, email, phone call, or SMS, etc.) giving you details on what has broken. \nYou will be expected to gather as much information as possible, create the required cards in our ticketing systems, delegate or assign the card to the right person/watchers and otherwise take whatever actions are necessary in order to resolve the issue.    On-call responsibilities extend beyond normal office hours, and if you are on-call you are expected to be able to respond to issues, even at 2am. This sounds horrible (and it can be), but this is what our customers go through, and is the problem that the Spearhead Systems technical support services is trying to fix!  When you are on-call during normal working hours you are the central contact for our entire support team. We expect you will delegate and assign the card to your colleagues and only attempt to resolve issues if your current workload permits. \nWhen you are on-call outside working hours you are expected to handle as much of the process as possible and delegate only if it is outside your area of expertise or you encounter problems that require another colleagues input.   When in the office  You are generally speaking on-call during your normal working hours even if you are not  the  on-call engineer. This means you are keeping an eye on the cards assigned to you directly or that you are a watcher for. 
If you are ever in a position where you have no assigned cards and it is not clear what to work on, ask a TL or senior Sysadmin to help point you in the right direction.", 
             "title": "What is On-Call?"
         }, 
         {
             "location": "/oncall/being_oncall/#responsibilities", 
-            "text": "Prepare   Have your laptop and Internet with you (office, home, a MiFi dongle, a phone with a tethering plan, etc).  Have a way to charge your MiFi.    Team alert escalation happens within 5 minutes, set/stagger your notification timeouts (push, SMS, phone...) accordingly.  Make sure Spearhead Systems (and colleagues directly) texts and calls can bypass your \"Do Not Disturb\" settings.    Be prepared (environment is set up, you have remote access tools ready and functional, your credentials are current, you have Java installed, ssh-keys and so on...)  Read our Issue Response documentation (that's this!) to understand how we handle incidents and service requests, what the different roles and methods of communication are, etc.  Be aware of your upcoming on-call time (primary, backup) and arrange swaps around travel, vacations, appointments etc.     Triage   Acknowledge and act on alerts whenever you can (see the first \"Not responsibilities\" point below)  Determine the urgency of the problem:  Is it something that should be worked on right now or escalated into a major incident? (\"production server on fire\" situations. Security alerts) - do so.  Is it some tactical work that doesn't have to happen during the night? (for example, disk utilization high watermark, but there's plenty of space left and the trend is not indicating impending doom) - snooze the alert until a more suitable time (working hours, the next morning...) and get back to fixing it then.    Check Slack for current activity. Often (but not always) actions that could potentially cause alerts will be announced there.  Does the alert and your initial investigation indicate a general problem or an issue with a specific service that the relevant team should look into? If it does not look like a problem you are the expert for, then escalate to another team member or group.     Fix   You are empowered to dive into any problem and act to fix it.  
Involve other team members as necessary: do not hesitate to escalate if you cannot figure out the cause within a reasonable timeframe or if the service / alert is something you have not tackled before.  If the issue is not very time sensitive and you have other priority work, make a note of this in DoIT to keep a track of it (with an appropriate severity and due date).     Improve   If a particular issue keeps happening; if an issue alerts often but turns out to be a preventable non-issue \u2013 perhaps improving this should be a longer-term task.  Disks that fill up, logs that should be rotated, noisy alerts...(we use ansible, go ahead and start automating!)    If information is difficult / impossible to find, write it down. Constantly refactor and improve our knowledge base and documentation. Add redundant links and pointers if your mental model of the wiki / codebase does not match the way it is currently organized.     Support   When your on-call \"shift\" ends, let the next on-call and team know about issues that have not been resolved yet and other experiences of note.  Make an effort to cleanly handover necessary information. We use Slack, email and DoIT to communicate.   This is a best-practice that should be applied whenever there are details that by sharong would benefit the efficiency of the team.    If you are making a change that impacts the schedule (adding / removing yourself, for example), let others know since many of us make arrangements around the on-call schedule well in advance.  Support each other: when doing activities that might generate plenty of pages, it is courteous to \"take the page\" away from the on-call by notifying them and scheduling an override for the duration.", 
+            "text": "Prepare   Have your laptop and Internet with you (office, home, a phone with a tethering plan, etc).  Have a way to charge your phone.    Team alert escalation happens within 30 minutes, set/stagger your notification timeouts (push, SMS, phone...) accordingly.  Make sure Spearhead Systems (and colleagues directly) texts and calls can bypass your \"Do Not Disturb\" settings.    Be prepared (environment is set up, you have remote access tools ready and functional, your credentials are current, you have Java installed, ssh-keys and so on...)  Read our Issue Response documentation (that's this!) to understand how we handle incidents and service requests, what the different roles and methods of communication are, etc.  Be aware of your upcoming on-call time (primary, backup) and arrange swaps around travel, vacations, appointments etc.     Triage   Acknowledge and act on alerts whenever you can (see the first \"Not responsibilities\" point below)  Determine the urgency of the problem:  Is it something that should be worked on right now or escalated into a major incident? (\"production server on fire\" situations. Security alerts) - do so.  Is it some tactical work that doesn't have to happen during the night? (for example, disk utilization high watermark, but there's plenty of space left and the trend is not indicating impending doom) - snooze the issue until a more suitable time (working hours, the next morning...) and get back to fixing it then.    Check our  internal Chat  for current activity. Often (but not always) actions that could potentially cause alerts will be announced there.  Does the alert and your initial investigation indicate a general problem or an issue with a specific service that the relevant team should look into? If it does not look like a problem you are the expert for, then escalate to another team member or group.     Fix   You are empowered to dive into any problem and act to fix it.  
Involve other team members as necessary: do not hesitate to escalate if you cannot figure out the cause within a reasonable timeframe or if the service / alert is something you have not tackled before.  If the issue is not very time sensitive and you have other priority work, make a note of this in DoIT to keep a track of it (with an appropriate severity, comment and due date).     Improve   If a particular issue keeps happening; if an issue alerts often but turns out to be a preventable non-issue \u2013 perhaps improving this should be a longer-term task.  Disks that fill up, logs that should be rotated, noisy alerts...(we use ansible and rundeck, go ahead and start automating!)  When we perform a DoD (definition of done) this is good time to bring up recurring issues for discussion.    If information is difficult / impossible to find, write it down. Constantly refactor and improve our knowledge base and documentation. Add redundant links and pointers if your mental model of the wiki / codebase does not match the way it is currently organized.     Support   When your on-call \"shift\" ends, let the next on-call and team know about issues that have not been resolved yet and other experiences of note.  Make an effort to cleanly handover necessary information. We use  internal Chat , email and DoIT to communicate.   This is a best-practice that should be applied whenever there are details that by sharing would benefit the efficiency of the team.    If you are making a change that impacts the schedule (adding / removing yourself, for example), let others know since many of us make arrangements around the on-call schedule well in advance.  Support each other: when doing activities that might generate plenty of alerts, it is courteous to \"place the service/host in maintenance\" and take it away from the on-call by notifying them and scheduling an override for the duration.", 
             "title": "Responsibilities"
         }, 
         {
@@ -57,22 +57,22 @@
         }, 
         {
             "location": "/oncall/being_oncall/#recommendations", 
-            "text": "Always have a backup schedule. Yes, this means two people being on-call at the same time, however it takes a lot of the stress off of the primary if they know they have a specific backup they can contact, rather than trying to chose a random member of the team.     The third-level of your escalation (after backup schedule) should probably be your entire team. This should hopefully never happen, but when it does, it's useful to be able to just get the next available person.       Team leaders (TL) can (and should) be part of your normal rotation. It gives a better insight into what has been going on.    New members of the team should shadow your on-call rotation during the first few weeks. They should get all alerts, and should follow along with what you are doing. (All new employees shadow the Support team for one week of on-call, but it's useful to have new team members shadow your team rotations also. Just not at the same time).    Our escalation timeout is set to 5 minutes. This is usually plenty of time for someone to acknowledge the incident if they're able to. If they're not able to within 5 minutes, then they're probably not in a good position to respond to the incident anyway.   Triggering an escalation is done automatically in most situations based on the type, priority and severity of the issue.     When going off-call, you should provide a quick summary to the next on-call about any issues that may come up during their shift. A service has been flapping, an issue is likely to re-occur, etc. If you want to be formal, this can be a written report via email, but generally a verbal summary is sufficient.", 
+            "text": "Always have a backup schedule. Yes, this means two people being on-call at the same time, however it takes a lot of the stress off of the primary if they know they have a specific backup they can contact, rather than trying to choose a random member of the team.     The third-level of your escalation (after backup schedule) should probably be your entire team. This should hopefully never happen, but when it does, it's useful to be able to just get the next available person.       Team leaders (TL) are a part of our normal rotation. It gives a better insight into what has been going on.    New members of the team should shadow your on-call rotation during the first few weeks. They should get all alerts, and should follow along with what you are doing. (All new employees shadow the Support team for one week of on-call, but it's useful to have new team members shadow your team rotations also).      When going off-call, you should provide a quick summary to the next on-call about any issues that may come up during their shift. A service has been flapping, an issue is likely to re-occur, etc. If you want to be formal, this can be a written report via email, but generally a verbal summary during our morning stand-up is sufficient.", 
             "title": "Recommendations"
         }, 
         {
             "location": "/oncall/being_oncall/#notification-method-recommendations", 
-            "text": "You are free to set up your notification rules as you see fit, to match how you would like to best respond to incidents. If you're not sure how to configure them, the Support team has some recommendations,    Use Push Notification and Email as your first method of notification. Most of us have phones with us at all times, so this is a prudent first method and is usually sufficient. (DoIT is in the process of integratoin with SNS for push notifications)  Use Phone and/or SMS notification each minute after, until the escalation time. If Push didn't work, then it's likely you need something stronger, like a phone call. Keep calling every minute until it's too late. If you don't pick up by the 3rd time, then it's unlikely you are able to respond, and the incident will get escalated away from you.", 
+            "text": "You are free to set up your notification rules as you see fit, to match how you would like to best respond to incidents. If you're not sure how to configure them, the Support team has some recommendations,", 
             "title": "Notification Method Recommendations"
         }, 
         {
             "location": "/oncall/being_oncall/#etiquette", 
-            "text": "If the current on-call comes into the office at 12pm looking tired, it's not because they're lazy. They probably got paged in the night. Cut them some slack and be nice.    Don't acknowledge an incident out from under someone else. If you didn't get paged for the incident, then you shouldn't be acknowledging it. Add a comment with your notes instead.       If you are testing something, or performing an action that you know will cause a page (notification, alert), it's customary to \"take the pager\" for the time during which you will be testing. Notify the person on-call that you are taking the pager for the next hour while you test.    \"Never hesitate to escalate\" - Never feel ashamed to rope in someone else if you're not sure how to resolve an issue. Likewise, never look down on someone else if they ask you for help.    Always consider covering an hour or so of someone else's on-call time if they request it and you are able. We all have lives which might get in the way of on-call time, and one day it might be you who needs to swap their on-call time in order to have a night out with your friend from out of town.    If an issue comes up during your on-call shift for which you got paged, you are responsible for resolving it. Even if it takes 3 hours and there's only 1 hour left of your shift. You can hand over to the next on-call if they agree, but you should never assume that's possible.", 
+            "text": "If the current on-call comes into the office at 12pm looking tired, it's not because they're lazy. They probably got paged in the night. Cut them some slack and be nice.    Don't close or otherwise modify a card out from under someone else. If you didn't get that specific card assigned to you as owner or a watcher, then you shouldn't be modifying it. Add a comment with your notes instead in the monitoring system and in DoIT.       If you are testing something, or performing an action that you know will cause an alert from our monitoring or possibly may be identified as an issue by our customers, it's customary to \"place the host/service in downtime\" and announce all the involved parties, for the time during which you will be testing. Notify the person on-call so they are aware of your testing.    \"Never hesitate to escalate\" - Never feel ashamed to rope in someone else if you're not sure how to resolve an issue. Likewise, never look down on someone else if they ask you for help.    Always consider covering an hour or so of someone else's on-call time if they request it and you are able. We all have lives which might get in the way of on-call time, and one day it might be you who needs to swap their on-call time in order to have a night out with your friend from out of town.    If an issue comes up during your on-call shift for which you got called, you are responsible for resolving it. Even if it takes 3 hours and there's only 1 hour left of your shift. You can hand over to the next on-call if they agree, but you should never assume that's possible.", 
             "title": "Etiquette"
         }, 
         {
             "location": "/oncall/alerting_principles/", 
-            "text": "We manage how we get alerted based on many factors such as the customers contractual SLA, the urgency of their request or incident, etc.. \nan alert or notification is something which requires a human to perform an action\n. Based on the severity of the issue (service request or incident) we prioritize accordingly in \nDoIT\n.\n\n\n\n\nMajor Priority Alerts\n\n\nAnything that wakes up a human in the middle of the night should be \nimmediately human actionable\n. If it is none of those things, then we need to adjust the alert to not page at those times.\n\n\n\n\n\n\n\n\n\n\nPriority\n\n\nAlerts\n\n\nResponse\n\n\n\n\n\n\n\n\n\n\nMajor\n\n\nMajor-Priority Spearhead Alert 24/7/365.\n\n\nRequires \nimmediate human action\n.\n\n\n\n\n\n\nNormal\n\n\nNormal-Priority Alert during \nbusiness hours only\n.\n\n\nRequires human action that same working day.\n\n\n\n\n\n\nMinor\n\n\nMinor-Priority Alert 24/7/365.\n\n\nRequires human action at some point.\n\n\n\n\n\n\nNotification\n\n\nSuppressed Events. No response required.\n\n\nInformational only. We do not need these to clutter our ticketing or inboxes. If they are enabled they should be sent only to required/specific people, not groups.\n\n\n\n\n\n\n\n\nBoth IN and SR (incidents, service requests) share the same priorities. The actual response / resolution times vary and are based upon contractual agreements with the customer. These details (SLA) are available in DoIT on the organization page of the respective customer.\n\n\nIf you're setting up a new alert/notification, consider the chart above for how you want to alert people. Be mindful of not creating new high-priority alerts if they don't require an immediate response, for example.\n\n\n\n\nAlert Channels\n\n\nPresently we use email as the only notification method. This means keeping an eye on your email is essential!\nSMS and Push notifications are in the pipeline for DoIT.  
\n\n\n\n\nExamples\n#\n\n\n\"Production service is failing for 75% of requests, automation is unable to resolve.\"_\n#\n\n\nThis would be a \nMajor\n priority IN, requiring immediate human action to resolve.\n\n\n\n\n\"A customer sends an email stating that \"Production server disk space is filling, expected to be full in 48 hours. Log rotation is insufficient to resolve.\"\n#\n\n\nThis would be a \nNormal\n priority SR, requiring human action soon, but not immediately.\n\n\n\n\n\"An SSL certificate is due to expire in one week.\"\n#\n\n\nThis would be a \nMinor\n priority SR, requiring human action some time soon.", 
+            "text": "We manage how we get alerted based on many factors such as the customers contractual SLA, the urgency of their request or incident, etc.. \nan alert or notification is something which requires a human to perform an action\n. Based on the severity of the issue (service request or incident) we prioritize accordingly in \nDoIT\n.\n\n\n\n\nMajor Priority Alerts\n\n\nAnything that wakes up a human in the middle of the night should be \nimmediately human actionable\n. If it is none of those things, then we need to adjust the alert to not bother us at those times.\n\n\n\n\n\n\n\n\n\n\nPriority\n\n\nAlerts\n\n\nResponse\n\n\n\n\n\n\n\n\n\n\nMajor\n\n\nMajor-Priority Spearhead Alert 24/7/365.\n\n\nRequires \nimmediate human action\n.\n\n\n\n\n\n\nNormal\n\n\nNormal-Priority Alert during \nbusiness hours only\n.\n\n\nRequires human action that same working day.\n\n\n\n\n\n\nMinor\n\n\nMinor-Priority Alert 24/7/365.\n\n\nRequires human action at some point.\n\n\n\n\n\n\nNotification\n\n\nSuppressed Events. No response required.\n\n\nInformational only. We do not need these to clutter our ticketing or inboxes. If they are enabled they should be sent only to required/specific people, not groups.\n\n\n\n\n\n\n\n\nBoth IN and SR (incidents, service requests) share the same priorities. The actual response / resolution times vary and are based upon contractual agreements with the customer. These details (SLA) are available in DoIT on the organization page.\n\n\nIf you're setting up a new alert/notification, consider the chart above for how you want to alert people. Be mindful of not creating new high-priority alerts if they don't require an immediate response, for example.\n\n\n\n\nAlert Channels\n\n\nPrimarily we use email as the notification/alert methods and all of our customers are encouraged to use this method. Secondly there is the DoIT customer portal which will send alerts to the on-call person(s) and escalate based on SLA/contractual agreements. 
Thirdly we use our centralized support telephone number and individual phones. This means keeping an eye on your email is essential!\n\n\nSMS and Push notifications are in the pipeline for DoIT.  \n\n\n\n\nExamples\n#\n\n\n\"Production service is failing for 75% of requests, automation is unable to resolve.\"_\n#\n\n\nThis would be a \nMajor\n priority IN, requiring immediate human action to resolve.\n\n\n\n\n\"A customer sends an email stating that \"Production server disk space is filling, expected to be full in 48 hours. Log rotation is insufficient to resolve.\"\n#\n\n\nThis would be a \nNormal\n priority SR, requiring human action soon, but not immediately.\n\n\n\n\n\"An SSL certificate is due to expire in one week.\"\n#\n\n\nThis would be a \nMinor\n priority SR, requiring human action some time soon.", 
             "title": "Alerting Principles"
         }, 
         {
@@ -97,12 +97,12 @@
         }, 
         {
             "location": "/before/severity_levels/", 
-            "text": "The first step in any incident response process is to determine what actually constitutes an incident. We have two high level categories for classifying incidents: this is done using \"SR\" or \"IN\" defintions with an attached priority of \"Minor\", \"Normal\" or \"Major\". \"SR\" are \"Service requests\" initiated by a customer and usually do not constitute a critical issue (there are exceptions) while \"IN\" are \"incidents\" which are generally \"urgent\".\n\n\nAll of our operational issues are to be classified as either a Service Request or an Incident. Incidents have priority over Service Requests provided that there are no Service Requests with a higher priority. In general you will want to resolve a higher severity SR or IN than a lower one (a \"Major\" priority gets a more intensive response than a \"Normal\" incident for example).\n\n\n\n\nAlways Assume The Worst\n\n\nIf you are unsure which level an incident is (e.g. not sure if IN is Major or Normal), \ntreat it as the higher one\n. During an incident is not the time to discuss or litigate severities, just assume the highest and review during a post-mortem.\n\n\n\n\n\n  \n\n    \n\n      \nSeverity\n\n      \nDescription\n\n      \nWhat To Do\n\n    \n\n  \n\n  \n\n    \n\n      \nMajor\n\n      \n\n        \n\n          \nThe system is in a critical state and is actively impacting a large number of customers.\n\n          \nFunctionality has been severely impaired for a long time, breaking SLA.\n\n          \nCustomer-data-exposing security vulnerability has come to our attention.\n\n        \n\n      \n\n      \nSee \nDuring an Incident\n.\n\n    \n\n    \n\n      \nNormal\n\n      \n\n        \n\n          \nFunctionality of virtualization platform is severely impaired.\n\n          \nE-mail system is offline.\n\n        \n\n      \n\n      \nSee \nDuring an Incident\n.\n\n    \n\n    \n\n      \nAnything above this line is considered a \"Major Incident\". 
These are generally Incidents (IN). Below are service requests (SR) which are usually initiated by a human who can help with prioritizing. A call is triggered for all major incidents (indifferently of SR or IN).\n\n    \n\n    \n\n      \nNormal\n\n      \n\n        \n\n          \nPartial loss of functionality, only affecting minority of customers.\n\n          \nSomething that has the likelihood of becoming Major if nothing is done.\n\n          \nNo redundancy in a service (failure of 1 more node will cause outage).\n\n        \n\n      \n\n      \n\n        \n\n          \nWork on issue as your top priority.\n\n          \nLiaise with engineers of affected systems to identify cause.\n\n          \nIf related to recent deployment, rollback.\n\n          \nMonitor status and notice if/when it escalates.\n\n          \nMention on Slack if you think it has the potential to escalate.\n\n        \n\n      \n\n    \n\n    \n\n      \nNormal\n\n      \n\n        \n\n          \nPerformance issues (delays, etc). Tasks that require non-immediate attention.\n\n          \nJob failure (not impacting alerting).\n\n        \n\n      \n\n      \n\n        \n\n          \nWork on the issue as your first priority (above \"Low\" tasks).\n\n          \nMonitor status and notice if/when it escalates.\n\n        \n\n      \n\n    \n\n    \n\n      \nLow\n\n      \n\n        \n\n          \nNormal issues which aren't impacting system use, cosmetic issues, etc.\n\n        \n\n      \n\n      \n\n        \n\n          \nCreate a DoIT card and assign to owner of affected system.\n\n        \n\n      \n\n    \n\n  \n\n\n\n\n\n\n\nBe Specific\n\n\nWhen creating Cards in Doit, be as specific as possible and include all necessary details. Include relevant details regarding when the issue started, what may have triggered it, etc.. Document your efforts through worklogs and be specific there as well.", 
+            "text": "The first step in any incident response process is to determine what actually constitutes an incident. We have two high level categories for classifying incidents: these are \"SR\" or \"IN\" definitions with an attached priority of \"Minor\", \"Normal\" or \"Major\". \"SR\" are \"Service requests\" initiated by a customer and usually do not constitute a critical issue (there are exceptions) while \"IN\" are \"incidents\" which are generally \"urgent\".\n\n\nAll issues reported to Spearhead are to be classified as either a Service Request or an Incident. Incidents have priority over Service Requests provided that there are no Service Requests with a higher priority. In general you will want to resolve a higher severity SR or IN before a lower one (a \"Major\" priority gets a more intensive response than a \"Normal\" incident for example).\n\n\n\n\nAlways Assume The Worst\n\n\nIf you are unsure which level an incident is (e.g. not sure if IN is Major or Normal), \ntreat it as the higher one\n. 
During an incident is not the time to discuss or litigate severities, just assume the highest and review during a post-mortem.\n\n\n\n\n\n  \n\n    \n\n      \nSeverity\n\n      \nDescription\n\n      \nWhat To Do\n\n    \n\n  \n\n  \n\n    \n\n      \nMajor\n\n      \n\n        \n\n          \nThe system is in a critical state and is actively impacting a large number of customers.\n\n          \nFunctionality has been severely impaired for a long time, breaking SLA.\n\n          \nCustomer-data-exposing security vulnerability has come to our attention.\n\n        \n\n      \n\n      \nSee \nDuring an Incident\n.\n\n    \n\n    \n\n      \nMajor\n\n      \n\n        \n\n          \nFunctionality of virtualization platform is severely impaired.\n\n          \nE-mail system is offline.\n\n        \n\n      \n\n      \nSee \nDuring an Incident\n.\n\n    \n\n\n    \n\n      \nNormal\n\n      \n\n        \n\n          \nPartial loss of functionality, only affecting minority of customers.\n\n          \nSomething that has the likelihood of becoming Major if nothing is done.\n\n          \nNo redundancy in a service (failure of 1 more node will cause outage).\n\n        \n\n      \n\n      \n\n        \n\n          \nWork on issue as your top priority.\n\n          \nLiaise with engineers of affected systems to identify cause.\n\n          \nIf related to recent deployment, rollback.\n\n          \nMonitor status and notice if/when it escalates.\n\n          \nMention on Slack if you think it has the potential to escalate.\n\n        \n\n      \n\n    \n\n    \n\n      \nNormal\n\n      \n\n        \n\n          \nPerformance issues (delays, etc). 
Tasks that require non-immediate attention.\n\n          \nJob failure (not impacting alerting).\n\n        \n\n      \n\n      \n\n        \n\n          \nWork on the issue as your first priority (above \"Low\" tasks).\n\n          \nMonitor status and notice if/when it escalates.\n\n        \n\n      \n\n    \n\n    \n\n      \nLow\n\n      \n\n        \n\n          \nNormal issues which aren't impacting system use, cosmetic issues, etc.\n\n        \n\n      \n\n      \n\n        \n\n          \nCreate a DoIT card and assign to owner of affected system.\n\n        \n\n      \n\n    \n\n  \n\n\n\n\n\n\n\nBe Specific\n\n\nWhen creating Cards in Doit, be as specific as possible and include all necessary details. Include relevant details regarding when the issue started, what may have triggered it, etc.. Document your efforts through worklogs and be specific there as well.", 
             "title": "Severity Levels"
         }, 
         {
             "location": "/before/different_roles/", 
-            "text": "Our support services are deliviered via a flat organizational structure. The same people that deliver projects are also there to deliver ongoing support/maintenance services.\nThere are several roles in our support team at Spearhead Systems. Certain roles only have one person per incident (e.g. sysadmin), whereas other roles can have multiple people (e.g. Sysadmins, Solution Architects, etc.). It's all about coming together as a team, working the problem, and getting a solution quickly.\n\n\nHere is a rough outline of our role hierarchy, with each role discussed in more detail on the rest of this page.\n\n\n\n\n\n\nTeam Leader (TL)\n#\n\n\nWhat is it?\n#\n\n\nA Team Leader acts as the single source of truth of what is currently happening and what is going to happen during an major incident. They come in all shapes, sizes, and colors. TL's are also the key elements in a project (boards in DoIT).\n\n\nWhy have one?\n#\n\n\nAs any system grows in size and complexity, things break and cause incidents. The TL is needed to help drive major incidents to resolution by organizing his team towards a common goal. 
A TL's skillset includes project and resource management skills which are essential in driving both projects and incidents to a smooth resolution.\n\n\nWhat are the responsibilities?\n#\n\n\n\n\nHelp prepare for projects and incidents,\n\n\nSetup communications channels.\n\n\nCreate the DoIT board(s) and other project planning related materials.\n\n\nFunnel people to these communications channels.\n\n\nTrain team members on how to communicate and train other TL's.\n\n\n\n\n\n\nDrive incidents and projects to resolution,\n\n\nGet everyone on the same communication channel.\n\n\nCollect information from team members for their services/area of ownership status.\n\n\nCollect proposed repair actions, then recommend repair actions to be taken.\n\n\nDelegate all repair actions, the TL is NOT a resolver.\n\n\nBe the single authority on system status\n\n\nCommunicate directly with the customers and end-users\n\n\nnot the engineers themselves!\n\n\n\n\n\n\n\n\n\n\nPost Mortem,\n\n\nCreating the initial template right after the incident so people can put in their thoughts while fresh.\n\n\nAssigning the post-mortem after the event is over, this can be done after the call.\n\n\nWork with Managers/Support on scheduling preventive actions.\n\n\n\n\n\n\n\n\nWho are they?\n#\n\n\nAnyone on the TL on-call schedule. Trainees are typically on the TL Shadow schedule.\n\n\nHow can I become one?\n#\n\n\nTake a look at our \nTeam Leader training guide\n.\n\n\n\n\nSysadmin\n#\n\n\nWhat is it?\n#\n\n\nA Sysadmin is a direct support role for the Team Leader. This is not a shadow where the person just observes, the Sysadmin is expected to perform important tasks during an incident.\n\n\nWhy have one?\n#\n\n\nIt's important for the TL to focus on the problem at hand, rather than worrying about documenting the steps or monitoring timers. 
The Sysadmin helps to support the TL and keep them stay focussed on the incident.\n\n\nWhat are the responsibilities?\n#\n\n\nThe Sysadmin is expected to:\n\n\n\n\nBring up issues to the TL that may otherwise not be addressed (keeping an eye on timers that have been started, circling back around to missed items from a roll call, etc).\n\n\nBe a \"hot standby\" TL, should the primary need to either transition to a SME, or otherwise have to step away from the TL role.\n\n\nPage SME's or other on-call engineers as instructed by the Team Leader.\n\n\nManage the incident call, and be prepared to remove people from the call if instructed by the Team Leader.\n\n\nLiaise with stakeholders and provide status updates on DoIT (using worklogs and comments), Slack and email/telefone as necessary.\n\n\n\n\nWho are they?\n#\n\n\nAny Team Leader can act as a Sysadmin. Sysadmins need to be trained as an Team Leader as they may be required to take over command.\n\n\nHow can I become one?\n#\n\n\nTake a look at our \nSysadmin training guide\n. Sysadmins also need to be \ntrained as an Team Leaders\n.\n\n\n\n\nScribe\n#\n\n\nWhat is it?\n#\n\n\nA Scribe documents the timeline of an incident as it progresses, and makes sure all important decisions and data are captured for later review. We will not have a dedicated Scibe in all situations therefore a junior will take on this role. This is an essential role as all Juniors are expectd to grow into other areas and take on more responsibilities as they evolve.\n\n\nWhy have one?\n#\n\n\nThe Team Leader will need to focus on the problem at hand, and the sysadmins and subject matter experts will need to focus on resolving the incident. 
It is important to capture a timeline of events as they happen so that they can be reviewed during the post-mortem to determine how well we performed, and so we can accurate determine any additional impact that we might not have noticed at the time.\n\n\nWhat are the responsibilities?\n#\n\n\nThe Scribe is expected to:\n\n\n\n\nEnsure the incident call is being recorded.\n\n\nNote in DoIT, Slack, etc. important data, events, and actions, as they happen. Specifically:\n\n\nKey actions as they are taken (Example: \"prod-server-387723 is being restarted to attempt to remove the stuck lock\")\n\n\nStatus reports when one is provided by the TL (Example: \"We are in IN-Major, service A is currently not processing events due to a stuck lock, X is restarting the app stack, next checkin in 3 minutes\")\n\n\nAny key callouts either during the call or at the ending review (Example: \"Note: (Bob B) We should have a better way to determine stuck locks.\")\n\n\n\n\n\n\n\n\nWho are they?\n#\n\n\nAnyone can act as a Sribe during an incident, and are chosen by the Team Leader at the start of the call. Typically the Sysadmin will act as the Scribe, but that doesn't necessarily need to happen, and for larger incidents may not be possible.\n\n\nHow can I become one?\n#\n\n\nFollow our \nScribe training guide\n, and then notify the Team Leaders that you would like to be considered for scribing for the next incident.\n\n\n\n\nSubject Matter Expert\n#\n\n\nWhat is it?\n#\n\n\nA Subject Matter Expert (SME), sometimes called a \"Resolver\" or \"Architect\", is a domain expert or designated owner of a component or service that is part of the Spearhead Systems service delivery concept.\n\n\nWhy have one?\n#\n\n\nThe TL and Sysadmins are not all-knowing super beings. 
When there is a problem with a service or a particular system, an expert in that service is needed to be able to quickly help identify and fix issues.\n\n\nWhat are the responsibilities?\n#\n\n\n\n\nBeing able to diagnose common problems with the service.\n\n\nBeing able to rapidly fix issues found during an incident.\n\n\nConcise communication skills, specifically for CAN reports:\n\n\nCondition: What is the current state of the service? Is it healthy or not?\n\n\nActions: What actions need to be taken if the service is not in a healthy state?\n\n\nNeeds: What support does the resolver need to perform an action?\n\n\n\n\n\n\n\n\nWho are they?\n#\n\n\nAnyone who is considered a \"domain expert\" can act as a resolver for an incident. Typically the service's primary on-call will act as the SME for that service.\n\n\nHow can I become one?\n#\n\n\nTake a look at our \nSubject Matter Expert training guide\n. You should also discuss with your team and service owner to determine what the requirements are for your particular service.\n\n\n\n\nCustomer Liaison\n#\n\n\nWhat is it?\n#\n\n\nA person responsible for interacting with customers, either directly, or via our public communication channels. 
This is typically the TL while in some situations another member of the Support Team or even Management may intervene and relay vital information to the customer.\n\n\nWhy have one?\n#\n\n\nAll of the other roles will be actively working on identifying the cause and resolving the issue, we need a role which is focused purely on the customer interaction side of things so that it can be done properly, with the due care and attention it needs.\n\n\nWhat are the responsibilities?\n#\n\n\n\n\nPost any publicly facing messages regarding the incident (DoIT, Twitter, etc).\n\n\nNotify the TL of any customers reporting that they are affected by the incident.\n\n\n\n\nWho are they?\n#\n\n\nAny member of the Support Team or Management (provided user has undergone trainig) can act as a customer liaison.\n\n\nHow can I become one?\n#\n\n\nDiscuss with the Support Team about becoming our next customer liaison.", 
+            "text": "Our support services are currently delivered via a flat organizational structure. \nThere are however several roles in our support team at Spearhead Systems. Certain roles only have one person per incident (e.g. sysadmin), whereas other roles can have multiple people (e.g. Sysadmins, Solution Architects, etc.). It's all about coming together as a team, working the problem, and getting a solution quickly.\n\n\nHere is a rough outline of our role hierarchy, with each role discussed in more detail on the rest of this page.\n\n\n\n\n\n\nTeam Leader (TL)\n#\n\n\nWhat is it?\n#\n\n\nA Team Leader acts as the single source of truth of what is currently happening and what is going to happen during a major incident and general ongoing support. They come in all shapes, sizes, and colors. TL's are also the key elements in a project (boards in DoIT).\n\n\nWhy have one?\n#\n\n\nAs any system grows in size and complexity, things break and cause incidents. The TL is needed to help drive major incidents to resolution by organizing his team towards a common goal. 
A TL's skillset includes project and resource management skills which are essential in driving both projects and incidents to a smooth resolution.\n\n\nWhat are the responsibilities?\n#\n\n\n\n\nHelp prepare for projects and incidents,\n\n\nSetup communications channels.\n\n\nCreate the DoIT board(s) and other project planning related materials.\n\n\nFunnel people to these communications channels.\n\n\nTrain team members on how to communicate and train other TL's.\n\n\nTrain team members and help them prepare with the proper know-how/tools to deliver the project.\n\n\n\n\n\n\nDrive incidents and projects to resolution,\n\n\nGet everyone on the same communication channel.\n\n\nCollect information from team members for their services/area of ownership status.\n\n\nCollect proposed repair actions, then recommend repair actions to be taken.\n\n\nDelegate all repair actions, the TL is NOT a resolver.\n\n\nBe the single authority on system status\n\n\nCommunicate directly with the customers and end-users\n\n\nnot the engineers themselves!\n\n\n\n\n\n\n\n\n\n\nPost Mortem,\n\n\nCreating the initial template right after the incident so people can put in their thoughts while fresh.\n\n\nAssigning the post-mortem after the event is over, this can be done after the call.\n\n\nWork with Managers/Support on scheduling preventive actions.\n\n\n\n\n\n\n\n\nWho are they?\n#\n\n\nAnyone on the on-call schedule is a TL during his shift. Trainees are typically on the TL Shadow schedule.\n\n\nHow can I become one?\n#\n\n\nTake a look at our \nTeam Leader training guide\n.\n\n\n\n\nSysadmin\n#\n\n\nWhat is it?\n#\n\n\nA Sysadmin is a direct support role for the Team Leader. This is not a shadow where the person just observes, the Sysadmin is expected to perform important tasks during an incident.\n\n\nWhy have one?\n#\n\n\nIt's important for the TL to focus on the problem at hand, rather than worrying about documenting the steps or monitoring timers. 
The Sysadmin helps to support the TL and help them stay focussed on the incident.\n\n\nWhat are the responsibilities?\n#\n\n\nThe Sysadmin is expected to:\n\n\n\n\nBring up issues to the TL that may otherwise not be addressed (keeping an eye on timers that have been started, circling back around to missed items from a roll call, etc).\n\n\nBe a \"hot standby\" TL, should the primary need to either transition to a SME, or otherwise have to step away from the TL role.\n\n\nCall SME's or other on-call engineers as instructed by the Team Leader.\n\n\nManage the incident call, and be prepared to remove people from the call if instructed by the Team Leader.\n\n\nLiaise with stakeholders and provide status updates on DoIT (using worklogs and comments), internal Chat and email/telephone as necessary.\n\n\n\n\nWho are they?\n#\n\n\nAny Team Leader can act as a Sysadmin. Sysadmins need to be trained as a Team Leader as they may be required to take over command.\n\n\nHow can I become one?\n#\n\n\nTake a look at our \nSysadmin training guide\n. Sysadmins also need to be \ntrained as a Team Leader\n.\n\n\n\n\nScribe\n#\n\n\nWhat is it?\n#\n\n\nA Scribe documents the timeline of an incident as it progresses, and makes sure all important decisions and data are captured for later review. We will not have a dedicated Scribe in all situations therefore a junior will take on this role. This is an essential role as all Juniors are expected to grow into other areas and take on more responsibilities as they evolve.\n\n\nWhy have one?\n#\n\n\nThe Team Leader will need to focus on the problem at hand, and the sysadmins and subject matter experts will need to focus on resolving the incident. 
It is important to capture a timeline of events as they happen so that they can be reviewed during the post-mortem to determine how well we performed, and so we can accurately determine any additional impact that we might not have noticed at the time.\n\n\nWhat are the responsibilities?\n#\n\n\nThe Scribe is expected to:\n\n\n\n\nEnsure the incident call is being recorded.\n\n\nNote in DoIT, internal Chat, etc. important data, events, and actions, as they happen. Specifically:\n\n\nKey actions as they are taken (Example: \"prod-server-387723 is being restarted to attempt to remove the stuck lock\")\n\n\nStatus reports when one is provided by the TL (Example: \"We are in IN-Major, service A is currently not processing events due to a stuck lock, X is restarting the app stack, next checkin in 3 minutes\")\n\n\nAny key callouts either during the call or at the ending review (Example: \"Note: (Bob B) We should have a better way to determine stuck locks.\")\n\n\n\n\n\n\n\n\nWho are they?\n#\n\n\nAnyone can act as a Scribe during an incident, and are chosen by the Team Leader at the start of the call. Typically the Sysadmin will act as the Scribe, but that doesn't necessarily need to happen, and for larger incidents may not be possible.\n\n\nHow can I become one?\n#\n\n\nFollow our \nScribe training guide\n, and then notify the Team Leaders that you would like to be considered for scribing for the next incident.\n\n\n\n\nSubject Matter Expert\n#\n\n\nWhat is it?\n#\n\n\nA Subject Matter Expert (SME), sometimes called a \"Resolver\" or \"Architect\", is a domain expert or designated owner of a component or service that is part of the Spearhead Systems service delivery concept.\n\n\nWhy have one?\n#\n\n\nThe TL and Sysadmins are not all-knowing super beings. 
When there is a problem with a service or a particular system, an expert in that service is needed to be able to quickly help identify and fix issues.\n\n\nWhat are the responsibilities?\n#\n\n\n\n\nBeing able to diagnose common problems with the service.\n\n\nBeing able to rapidly fix issues found during an incident.\n\n\nConcise communication skills, specifically for CAN reports:\n\n\nCondition: What is the current state of the service? Is it healthy or not?\n\n\nActions: What actions need to be taken if the service is not in a healthy state?\n\n\nNeeds: What support does the resolver need to perform an action?\n\n\n\n\n\n\n\n\nWho are they?\n#\n\n\nAnyone who is considered a \"domain expert\" can act as a resolver for an incident. Typically the service's primary on-call will act as the SME for that service.\n\n\nHow can I become one?\n#\n\n\nTake a look at our \nSubject Matter Expert training guide\n. You should also discuss with your team and service owner to determine what the requirements are for your particular service.\n\n\n\n\nCustomer Liaison\n#\n\n\nWhat is it?\n#\n\n\nA person responsible for interacting with customers, either directly, or via our public communication channels. 
This is typically the TL while in some situations another member of the Support Team or even Management may intervene and relay vital information to the customer.\n\n\nWhy have one?\n#\n\n\nAll of the other roles will be actively working on identifying the cause and resolving the issue, we need a role which is focused purely on the customer interaction side of things so that it can be done properly, with the due care and attention it needs.\n\n\nWhat are the responsibilities?\n#\n\n\n\n\nPost any publicly facing messages regarding the incident (DoIT, Twitter, etc).\n\n\nNotify the TL of any customers reporting that they are affected by the incident.\n\n\n\n\nWho are they?\n#\n\n\nAny member of the Support Team or Management (provided user has undergone training) can act as a customer liaison.\n\n\nHow can I become one?\n#\n\n\nDiscuss with the Support Team about becoming our next customer liaison.", 
             "title": "Different Roles"
         }, 
         {
@@ -112,7 +112,7 @@
         }, 
         {
             "location": "/before/different_roles/#what-is-it", 
-            "text": "A Team Leader acts as the single source of truth of what is currently happening and what is going to happen during an major incident. They come in all shapes, sizes, and colors. TL's are also the key elements in a project (boards in DoIT).", 
+            "text": "A Team Leader acts as the single source of truth of what is currently happening and what is going to happen during a major incident and general ongoing support. They come in all shapes, sizes, and colors. TL's are also the key elements in a project (boards in DoIT).", 
             "title": "What is it?"
         }, 
         {
@@ -122,12 +122,12 @@
         }, 
         {
             "location": "/before/different_roles/#what-are-the-responsibilities", 
-            "text": "Help prepare for projects and incidents,  Setup communications channels.  Create the DoIT board(s) and other project planning related materials.  Funnel people to these communications channels.  Train team members on how to communicate and train other TL's.    Drive incidents and projects to resolution,  Get everyone on the same communication channel.  Collect information from team members for their services/area of ownership status.  Collect proposed repair actions, then recommend repair actions to be taken.  Delegate all repair actions, the TL is NOT a resolver.  Be the single authority on system status  Communicate directly with the customers and end-users  not the engineers themselves!      Post Mortem,  Creating the initial template right after the incident so people can put in their thoughts while fresh.  Assigning the post-mortem after the event is over, this can be done after the call.  Work with Managers/Support on scheduling preventive actions.", 
+            "text": "Help prepare for projects and incidents,  Setup communications channels.  Create the DoIT board(s) and other project planning related materials.  Funnel people to these communications channels.  Train team members on how to communicate and train other TL's.  Train team members and help them prepare with the proper know-how/tools to deliver the project.    Drive incidents and projects to resolution,  Get everyone on the same communication channel.  Collect information from team members for their services/area of ownership status.  Collect proposed repair actions, then recommend repair actions to be taken.  Delegate all repair actions, the TL is NOT a resolver.  Be the single authority on system status  Communicate directly with the customers and end-users  not the engineers themselves!      Post Mortem,  Creating the initial template right after the incident so people can put in their thoughts while fresh.  Assigning the post-mortem after the event is over, this can be done after the call.  Work with Managers/Support on scheduling preventive actions.", 
             "title": "What are the responsibilities?"
         }, 
         {
             "location": "/before/different_roles/#who-are-they", 
-            "text": "Anyone on the TL on-call schedule. Trainees are typically on the TL Shadow schedule.", 
+            "text": "Anyone on the on-call schedule is a TL during their shift. Trainees are typically on the TL Shadow schedule.", 
             "title": "Who are they?"
         }, 
         {
@@ -147,12 +147,12 @@
         }, 
         {
             "location": "/before/different_roles/#why-have-one_1", 
-            "text": "It's important for the TL to focus on the problem at hand, rather than worrying about documenting the steps or monitoring timers. The Sysadmin helps to support the TL and keep them stay focussed on the incident.", 
+            "text": "It's important for the TL to focus on the problem at hand, rather than worrying about documenting the steps or monitoring timers. The Sysadmin helps to support the TL and help them stay focussed on the incident.", 
             "title": "Why have one?"
         }, 
         {
             "location": "/before/different_roles/#what-are-the-responsibilities_1", 
-            "text": "The Sysadmin is expected to:   Bring up issues to the TL that may otherwise not be addressed (keeping an eye on timers that have been started, circling back around to missed items from a roll call, etc).  Be a \"hot standby\" TL, should the primary need to either transition to a SME, or otherwise have to step away from the TL role.  Page SME's or other on-call engineers as instructed by the Team Leader.  Manage the incident call, and be prepared to remove people from the call if instructed by the Team Leader.  Liaise with stakeholders and provide status updates on DoIT (using worklogs and comments), Slack and email/telefone as necessary.", 
+            "text": "The Sysadmin is expected to:   Bring up issues to the TL that may otherwise not be addressed (keeping an eye on timers that have been started, circling back around to missed items from a roll call, etc).  Be a \"hot standby\" TL, should the primary need to either transition to a SME, or otherwise have to step away from the TL role.  Call SME's or other on-call engineers as instructed by the Team Leader.  Manage the incident call, and be prepared to remove people from the call if instructed by the Team Leader.  Liaise with stakeholders and provide status updates on DoIT (using worklogs and comments), internal Chat and email/telephone as necessary.", 
             "title": "What are the responsibilities?"
         }, 
         {
@@ -182,7 +182,7 @@
         }, 
         {
             "location": "/before/different_roles/#what-are-the-responsibilities_2", 
-            "text": "The Scribe is expected to:   Ensure the incident call is being recorded.  Note in DoIT, Slack, etc. important data, events, and actions, as they happen. Specifically:  Key actions as they are taken (Example: \"prod-server-387723 is being restarted to attempt to remove the stuck lock\")  Status reports when one is provided by the TL (Example: \"We are in IN-Major, service A is currently not processing events due to a stuck lock, X is restarting the app stack, next checkin in 3 minutes\")  Any key callouts either during the call or at the ending review (Example: \"Note: (Bob B) We should have a better way to determine stuck locks.\")", 
+            "text": "The Scribe is expected to:   Ensure the incident call is being recorded.  Note in DoIT, internal Chat, etc. important data, events, and actions, as they happen. Specifically:  Key actions as they are taken (Example: \"prod-server-387723 is being restarted to attempt to remove the stuck lock\")  Status reports when one is provided by the TL (Example: \"We are in IN-Major, service A is currently not processing events due to a stuck lock, X is restarting the app stack, next checkin in 3 minutes\")  Any key callouts either during the call or at the ending review (Example: \"Note: (Bob B) We should have a better way to determine stuck locks.\")", 
             "title": "What are the responsibilities?"
         }, 
         {
@@ -257,13 +257,13 @@
         }, 
         {
             "location": "/before/call_etiquette/", 
-            "text": "You've just joined Spearhead Systems support staff and you've never worked in a service delivery function before. You have no idea what an incident or a service request is. You have no idea what's going on, or what you're supposed to be doing. This page will help you through your first time and will provide a reference for future issues you may be a part of.\n\n\n\n\nCredit: \nOfficial White House Photo\n by Pete Souza\n\n\nFirst Steps\n#\n\n\n\n\nIf you intend on participating on the incident call you should join both the call, review the associated cards in DoIT, and jump on the corresponding Slack channel.\n\n\nMake sure you are in a quiet environment in order to participate on the call. Background noise should be kept to a minimum.\n\n\nKeep your microphone muted until you have something to say.\n\n\nIdentify yourself when you join the call; State your name and the system you are the expert for.\n\n\nSpeak up and speak clearly.\n\n\nBe direct and factual.\n\n\nKeep conversations/discussions short and to the point.\n\n\nBring any concerns to the Team Leader (IC) on the call.\n\n\nRespect time constraints given by the Team Leader.\n\n\n\n\n\n\nIncident Call\n\n\nNot all issues start with an incident call. Some issues may be completely automated and available only in DoIT while others may be in the incipient stages and the customer may still be on the phone/Slack detailing their issue.\n\n\n\n\nLingo\n#\n\n\nUse clear terminology, and avoid using acronyms or abbreviations during a call. Clear and accurate communication is more important than quick communication.\n\n\n\n\nStandard radio \nvoice procedure\n does not need to be followed on calls. However, you should familiarize yourself with the terms, as you may hear them on a call (or need to use them yourself). 
The ones in more active use on major incident calls are,\n\n\n\n\nAck/Rog\n - \"I have received and understood\"\n\n\nSay Again\n - \"Repeat your last message\"\n\n\nStandby\n - \"Please wait a moment for the next response\"\n\n\nWilco\n - \"Will comply\"\n\n\n\n\nDo not invent new abbreviations, and always favor being explicit of implicit. It is better to make things clearer than to try and save time by abbreviating, only to have a misunderstanding because others didn't know the abbreviation.\n\n\nThe Team Leader\n#\n\n\nThe Team Leader (TL) is the leader of the incident response process, and is responsible for bringing the incident to resolution. They will announce themselves at the start of the call, and will generally be doing most of the talking.\n\n\n\n\nFollow all instructions from the team leader, without exception.\n\n\nDo not perform any actions unless the team leader has told you to do so.\n\n\nThe team leader will typically poll for any strong objections before performing a large action. This is your time to raise any objections if you have them.\n\n\nOnce the team leader has made a decision, that decision is final and should be followed, even if you disagreed during the poll.\n\n\nAnswer any questions the team leader asks you in a clear and concise way.\n\n\nAnswering that you \"don't know\" something is perfectly acceptable. Do not try to guess.\n\n\n\n\n\n\nThe team leader may ask you to investigate something and get back to them in X minutes. Make sure you are ready with an answer within that time.\n\n\nAnswering that you need more time is perfectly acceptable, but you need to give the team leader an estimate of how much time.\n\n\n\n\n\n\n\n\nProblems?\n#\n\n\nThere's no team leader on the call! I don't know what to do!\n#\n\n\nAsk on the call if an TL is present. If you have no response, try asking in Slack. 
If there is no TL the sysadmin can take over this role temporarily.\n\n\nThere is not enough information!\n#\n\n\nThe definitive source of information for all issues is in DoIT. If at any point there is a discrepancy ask the TL or Sysadmins to provide up to date information and update the card/tasks accordingly. At a minimum information should be available in Slack.\n\n\nI can join the call or Slack, but not both, what should I do?\n#\n\n\nYou're welcome to join only one of the channels, however you should not actively participate in the incident response if so, as it causes disjoined communication. Liaise with someone who is both in Slack and on the call to provide any input you may have so that they can raise it.", 
+            "text": "You've just joined Spearhead Systems support staff and you've never worked in a service delivery function before. You have no idea what an incident or a service request is. You have no idea what's going on, or what you're supposed to be doing. This page will help you through your first time and will provide a reference for future issues you may be a part of.\n\n\n\n\nCredit: \nOfficial White House Photo\n by Pete Souza\n\n\nFirst Steps regarding Incidents\n#\n\n\n\n\nIf you intend on participating on the incident call you should join both the call (if there is a call), review the associated cards in DoIT, and jump on the corresponding internal Chat channel.\n\n\nMake sure you are in a quiet environment in order to participate on the call. Background noise should be kept to a minimum.\n\n\nKeep your microphone muted until you have something to say.\n\n\nIdentify yourself when you join the call; State your name and the system you are the expert for.\n\n\nSpeak up and speak clearly.\n\n\nBe direct and factual.\n\n\nKeep conversations/discussions short and to the point.\n\n\nBring any concerns to the Team Leader (TL) on the call.\n\n\nRespect time constraints given by the Team Leader.\n\n\n\n\n\n\nIncident Call\n\n\nNot all issues start with an incident call. Some issues may be completely automated and available only in DoIT while others may be in the incipient stages and the customer may still be on the phone/internal Chat detailing their issue.\n\n\n\n\nLingo\n#\n\n\nUse clear terminology, and avoid using acronyms or abbreviations during a call. Clear and accurate communication is more important than quick communication.\n\n\n\n\nStandard radio \nvoice procedure\n does not need to be followed on calls. However, you should familiarize yourself with the terms, as you may hear them on a call (or need to use them yourself). 
The ones in more active use on major incident calls are,\n\n\n\n\nAck/Rog\n - \"I have received and understood\"\n\n\nSay Again\n - \"Repeat your last message\"\n\n\nStandby\n - \"Please wait a moment for the next response\"\n\n\nWilco\n - \"Will comply\"\n\n\n\n\nDo not invent new abbreviations, and always favor being explicit over implicit. It is better to make things clearer than to try and save time by abbreviating, only to have a misunderstanding because others didn't know the abbreviation.\n\n\nThe Team Leader\n#\n\n\nThe Team Leader (TL) is the leader of the incident response process, and is responsible for bringing the incident to resolution. They will announce themselves at the start of the call, and will generally be doing most of the talking.\n\n\n\n\nTL is not available\n\n\nA TL may not be available in which case the incident call will be guided by the senior Sysadmin or SME available.\n\n\n\n\n\n\nFollow all instructions from the team leader, without exception.\n\n\nDo not perform any actions unless the team leader has told you to do so.\n\n\nThe team leader will typically poll for any strong objections before performing a large action. This is your time to raise any objections if you have them.\n\n\nOnce the team leader has made a decision, that decision is final and should be followed, even if you disagreed during the poll.\n\n\nAnswer any questions the team leader asks you in a clear and concise way.\n\n\nAnswering that you \"don't know\" something is perfectly acceptable. Do not try to guess.\n\n\n\n\n\n\nThe team leader may ask you to investigate something and get back to them in X minutes. Make sure you are ready with an answer within that time.\n\n\nAnswering that you need more time is perfectly acceptable, but you need to give the team leader an estimate of how much time.\n\n\n\n\n\n\n\n\nProblems?\n#\n\n\nThere's no team leader on the call! I don't know what to do!\n#\n\n\nAsk on the call if a TL is present. 
If you have no response, try asking in our internal Chat. If there is no TL the sysadmin can take over this role temporarily.\n\n\nThere is not enough information!\n#\n\n\nThe definitive source of information for all issues is in DoIT. If it is lacking there then you need to make a note of it and make sure that whoever created the card understands the importance of complete information in a timely manner. If at any point there is a discrepancy ask the TL or Sysadmins to provide up to date information and update the card/tasks accordingly.", 
             "title": "Call Etiquette"
         }, 
         {
-            "location": "/before/call_etiquette/#first-steps", 
-            "text": "If you intend on participating on the incident call you should join both the call, review the associated cards in DoIT, and jump on the corresponding Slack channel.  Make sure you are in a quiet environment in order to participate on the call. Background noise should be kept to a minimum.  Keep your microphone muted until you have something to say.  Identify yourself when you join the call; State your name and the system you are the expert for.  Speak up and speak clearly.  Be direct and factual.  Keep conversations/discussions short and to the point.  Bring any concerns to the Team Leader (IC) on the call.  Respect time constraints given by the Team Leader.    Incident Call  Not all issues start with an incident call. Some issues may be completely automated and available only in DoIT while others may be in the incipient stages and the customer may still be on the phone/Slack detailing their issue.", 
-            "title": "First Steps"
+            "location": "/before/call_etiquette/#first-steps-regarding-incidents", 
+            "text": "If you intend on participating on the incident call you should join both the call (if there is a call), review the associated cards in DoIT, and jump on the corresponding internal Chat channel.  Make sure you are in a quiet environment in order to participate on the call. Background noise should be kept to a minimum.  Keep your microphone muted until you have something to say.  Identify yourself when you join the call; State your name and the system you are the expert for.  Speak up and speak clearly.  Be direct and factual.  Keep conversations/discussions short and to the point.  Bring any concerns to the Team Leader (TL) on the call.  Respect time constraints given by the Team Leader.    Incident Call  Not all issues start with an incident call. Some issues may be completely automated and available only in DoIT while others may be in the incipient stages and the customer may still be on the phone/internal Chat detailing their issue.", 
+            "title": "First Steps regarding Incidents"
         }, 
         {
             "location": "/before/call_etiquette/#lingo", 
@@ -272,7 +272,7 @@
         }, 
         {
             "location": "/before/call_etiquette/#the-team-leader", 
-            "text": "The Team Leader (TL) is the leader of the incident response process, and is responsible for bringing the incident to resolution. They will announce themselves at the start of the call, and will generally be doing most of the talking.   Follow all instructions from the team leader, without exception.  Do not perform any actions unless the team leader has told you to do so.  The team leader will typically poll for any strong objections before performing a large action. This is your time to raise any objections if you have them.  Once the team leader has made a decision, that decision is final and should be followed, even if you disagreed during the poll.  Answer any questions the team leader asks you in a clear and concise way.  Answering that you \"don't know\" something is perfectly acceptable. Do not try to guess.    The team leader may ask you to investigate something and get back to them in X minutes. Make sure you are ready with an answer within that time.  Answering that you need more time is perfectly acceptable, but you need to give the team leader an estimate of how much time.", 
+            "text": "The Team Leader (TL) is the leader of the incident response process, and is responsible for bringing the incident to resolution. They will announce themselves at the start of the call, and will generally be doing most of the talking.   TL is not available  A TL may not be available in which case the incident call will be guided by the senior Sysadmin or SME available.    Follow all instructions from the team leader, without exception.  Do not perform any actions unless the team leader has told you to do so.  The team leader will typically poll for any strong objections before performing a large action. This is your time to raise any objections if you have them.  Once the team leader has made a decision, that decision is final and should be followed, even if you disagreed during the poll.  Answer any questions the team leader asks you in a clear and concise way.  Answering that you \"don't know\" something is perfectly acceptable. Do not try to guess.    The team leader may ask you to investigate something and get back to them in X minutes. Make sure you are ready with an answer within that time.  Answering that you need more time is perfectly acceptable, but you need to give the team leader an estimate of how much time.", 
             "title": "The Team Leader"
         }, 
         {
@@ -282,37 +282,32 @@
         }, 
         {
             "location": "/before/call_etiquette/#theres-no-team-leader-on-the-call-i-dont-know-what-to-do", 
-            "text": "Ask on the call if an TL is present. If you have no response, try asking in Slack. If there is no TL the sysadmin can take over this role temporarily.", 
+            "text": "Ask on the call if a TL is present. If you have no response, try asking in our internal Chat. If there is no TL, the sysadmin can take over this role temporarily.", 
             "title": "There's no team leader on the call! I don't know what to do!"
         }, 
         {
             "location": "/before/call_etiquette/#there-is-not-enough-information", 
-            "text": "The definitive source of information for all issues is in DoIT. If at any point there is a discrepancy ask the TL or Sysadmins to provide up to date information and update the card/tasks accordingly. At a minimum information should be available in Slack.", 
+            "text": "The definitive source of information for all issues is in DoIT. If information is missing there, make a note of it and make sure that whoever created the card understands the importance of providing complete information in a timely manner. If at any point there is a discrepancy, ask the TL or Sysadmins to provide up-to-date information and update the card/tasks accordingly.", 
             "title": "There is not enough information!"
         }, 
-        {
-            "location": "/before/call_etiquette/#i-can-join-the-call-or-slack-but-not-both-what-should-i-do", 
-            "text": "You're welcome to join only one of the channels, however you should not actively participate in the incident response if so, as it causes disjoined communication. Liaise with someone who is both in Slack and on the call to provide any input you may have so that they can raise it.", 
-            "title": "I can join the call or Slack, but not both, what should I do?"
-        }, 
         {
             "location": "/during/during_an_incident/", 
-            "text": "Information on what to do during a major incident. See our \nseverity level descriptions\n for what constitutes a major incident.\n\n\n\n\nDocumentation\n\n\nAlways document your activities. Keep a detailed worklog of your actions in DoIT and communicate verbosely on Slack or other channels (email, etc.). \n\n\n\n  \n\n  \n\n  \n\n    \n\n      \n#support\n\n      \nhttp://response.spearhead.systems\n\n      \n+40728 005 263\n \n\n    \n\n    \n\n      \nNeed an TL? Do \n!tl page\n in Slack\n\n    \n\n    \n\n      \nFor executive summary updates only, join \n#executive-summary-updates\n.\n\n    \n\n  \n\n\n\n\n\n\n\n\nSecurity Incident?\n\n\nIf this is a security incident, you should follow the \nSecurity Incident Response\n process.\n\n\n\n\nDon't Panic!\n#\n\n\n\n\n\n\nJoin the incident call and chat (see links above).\n\n\n\n\nAnyone is free to join the call or chat to observe and follow along with the incident.\n\n\nIf you wish to participate however, you should join both. If you can't join the call for some reason, you should have a dedicated proxy for the call. Disjointed discussions in the chat room are ultimately distracting.\n\n\n\n\n\n\n\n\nFollow along with the call/chat, add any comments you feel are appropriate, but keep the discussion relevant to the problem at hand.\n\n\n\n\nIf you are not an SME, try to filter any discussion through the primary SME for your service. Too many people discussing at once get become overwhelming, so we should try to maintain a hierarchical structure to the call if possible.\n\n\n\n\n\n\n\n\nFollow instructions from the Team Leader.\n\n\n\n\nIs there no TL on the call?\n\n\nManually page them via Slack, with \n!tl page\n in Slack. This will page the primary and backup TL's at the same time.\n\n\nNever hesitate to page the TL. It's much better to have them and not need them than the other way around.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nNot a call?\n\n\nNot all issues begin with a formal call. 
Some issues are self-explanatory and automatically generated via our monitoring platforms, a customer logging on to our portal, etc. In these scenarios \nDoIT\n is the definitive source. If that is not sufficient ask your TL.\n\n\n\n\nSteps for the Team Leader\n#\n\n\nResolve the incident as quickly and as safely as possible, use the Sysadmin to assist you. Delegate any tasks to relevant experts at your discretion.\n\n\n\n\n\n\nAnnounce on the call, in DoIT and in Slack that you are the team leader, who you have designated as sysadmin (usually the backup TL), and scribe/juniors if any.\n\n\n\n\n\n\nIdentify if there is an obvious cause to the incident (recent deployment, spike in traffic, etc.), delegate investigation to relevant experts,\n\n\n\n\nUse the service experts on the call to assist in the analysis. They should be able to quickly provide confirmation of the cause, but not always. It's the call of the TL on how to proceed in cases where the cause is not positively known. Confer with service owners and use their knowledge to help you.\n\n\n\n\n\n\n\n\nIdentify investigation \n repair actions (roll back, rate-limit services, etc) and delegate actions to relevant service experts. Typically something like this (obviously not an exhaustive list),\n\n\n\n\nBad Deployment:\n Roll it back.\n\n\nWeb Application Stuck/Crashed:\n Do a rolling restart.\n\n\nEvent Flood:\n Validate automatic throttling is sufficient, adjust manually if not.\n\n\nData Center Outage:\n Validate automation has removed bad data center. Force it to do so if not.\n\n\nDegraded Service Behavior without load:\n Gather forensic data (heap dumps, etc), and consider doing a rolling restart.\n\n\n\n\n\n\n\n\nListen for prompts from your Sysadmin regarding severity escalations, decide whether we need to announce publicly, and instruct customer liaison accordingly.\n\n\n\n\nAnnouncing publicly is at your discretion as TL. 
If you are unsure, then announce publicly (\"If in doubt, tweet it out\").\n\n\n\n\n\n\n\n\nOnce incident has recovered or is actively recovering, you can announce that the incident is over and that the call is ending. This usually indicates there's no more productive work to be done for the incident right now.\n\n\n\n\nMove the remaining, non-time-critical discussion to Slack.\n\n\nFollow up to ensure the customer liaison wraps up the incident publicly.\n\n\nIdentify any post-incident clean-up work.\n\n\nYou may need to perform debriefing/analysis of the underlying root cause.\n\n\n\n\n\n\n\n\n(After call ends) Create the post-mortem page from the template, and assign an owner to the post-mortem for the incident.\n\n\n\n\n\n\n(After call ends) Send out an internal email explaining that we had a major incident, provide a link to the post-mortem.\n\n\n\n\n\n\nSteps for Sysadmin\n#\n\n\nYou are there to support the TL in whatever they need.\n\n\n\n\n\n\nMonitor the status, and notify the TL if/when the incident escalates in severity level.\n\n\n\n\n\n\nBe prepared to page other people as directed by the Team Leader.\n\n\n\n\n\n\nProvide regular status updates in Slack (roughly every 30mins) to the executive team, giving an executive summary of the current status. Keep it short and to the point, and use @here.\n\n\n\n\n\n\nPerform any remediations, checking graphs, analysis or investigating logs unless otherwse delegated by the TL. \n\n\n\n\n\n\nFollow instructions from the Team Leader.\n\n\n\n\n\n\nSteps for Scribe\n#\n\n\nYou are there to document the key information from the incident in Slack, DoIT, our WiKi, etc.\n\n\n\n\n\n\nUpdate the apropriate channel with who the TL is, who the Sysadmin is, and that you're the scribe (if not already done).\n\n\n\n\ne.g. \"TL: Bob Boberson, Sysadmin: Gigi Con, Scribe: Writer Writerson\"\n\n\n\n\n\n\n\n\nYou should add notes to the proper channels when significant actions are taken, or findings are determined. 
You don't need to wait for the TL to direct this - use your own judgment.\n\n\n\n\nYou should also add \nTODO\n notes to the proper channel that indicate follow-ups slated for later.\n\n\n\n\n\n\n\n\nFollow instructions from the Team Leader.\n\n\n\n\n\n\nSteps for Subject Matter Experts\n#\n\n\nYou are there to support the team leader in identifying the cause of the incident, suggesting and evaluation repair actions, and following through on the repair actions.\n\n\n\n\n\n\nInvestigate the incident by analyzing any graphs or logs at your disposal. Announce all findings to the incident commander.\n\n\n\n\nIf you are unsure of the cause, that's fine, state that you are investigating and provide regular updates to the TL.\n\n\n\n\n\n\n\n\nAnnounce all suggestions for resolution to the team leader, it is their decision on how to proceed, do not follow any actions unless told to do so!\n\n\n\n\n\n\nFollow instructions from the team leader.\n\n\n\n\n\n\n(Optional) Once the call is over and post-mortem is created, add any notes you think are relevant to the post-mortem page.\n\n\n\n\n\n\nSteps for Customer Liaison\n#\n\n\nBe on stand-by to post public facing messages regarding the incident.\n\n\n\n\n\n\nYou will typically be required to update the status page and to send Tweets or other communications from our various accounts at certain times during the call.\n\n\n\n\n\n\nFollow instructions from the Team Leader.", 
+            "text": "Information on what to do during a major incident. See our \nseverity level descriptions\n for what constitutes a major incident.\n\n\n\n\nDocumentation\n\n\nAlways document your activities. Keep a detailed worklog of your actions in DoIT and communicate verbosely in our internal Chat or other channels (email, etc.). \n\n\n\n  \n\n  \n\n  \n\n    \n\n      \n#support\n (on MS Teams/internal Chat)\n\n      \nhttp://response.spearhead.systems\n\n      \n+40728 005 263\n \n\n    \n\n    \n\n      \nNeed an TL? Use a Sysadmin!\n\n    \n\n    \n\n      \nFor executive summary updates only, join \n#executive-summary-updates\n.\n\n    \n\n  \n\n\n\n\n\n\n\n\nSecurity Incident?\n\n\nIf this is a security incident, you should follow the \nSecurity Incident Response\n process.\n\n\n\n\nDon't Panic!\n#\n\n\n\n\n\n\nJoin the incident call and chat (see links above).\n\n\n\n\nAnyone is free to join the call or chat to observe and follow along with the incident.\n\n\nIf you wish to participate however, you should join both. If you can't join the call for some reason, you should have a dedicated proxy for the call. Disjointed discussions in the chat room are ultimately distracting.\n\n\n\n\n\n\n\n\nFollow along with the call/chat, add any comments you feel are appropriate, but keep the discussion relevant to the problem at hand.\n\n\n\n\nIf you are not an SME, try to filter any discussion through the primary SME for your service. Too many people discussing at once becomes overwhelming, so we try to maintain a hierarchical structure to the call if possible.\n\n\n\n\n\n\n\n\nFollow instructions from the Team Leader.\n\n\n\n\nIs there no TL on the call?\n\n\nCall them! \n\n\nNever hesitate to call the TL. It's much better to have them and not need them than the other way around.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nNot a call?\n\n\nNot all issues begin with a formal call. 
Some issues are self-explanatory and automatically generated via our monitoring platforms, a customer logging on to our portal, etc. In these scenarios \nDoIT\n is the definitive source. If that is not sufficient ask your TL and Sysadmin.\n\n\n\n\nSteps for the Team Leader\n#\n\n\nResolve the incident as quickly and as safely as possible, use the Sysadmin to assist you. Delegate any tasks to relevant experts at your discretion.\n\n\n\n\n\n\nAnnounce on the call, in DoIT and in our internal Chat that you are the team leader, who you have designated as sysadmin (usually the backup TL), and scribe/juniors if any.\n\n\n\n\n\n\nIdentify if there is an obvious cause to the incident (recent deployment, spike in traffic, etc.), delegate investigation to relevant experts,\n\n\n\n\nUse the service experts on the call to assist in the analysis. They should be able to quickly provide confirmation of the cause, but not always. It's the call of the TL on how to proceed in cases where the cause is not positively known. Confer with service owners and use their knowledge to help you.\n\n\n\n\n\n\n\n\nIdentify investigation \n repair actions (roll back, rate-limit services, etc) and delegate actions to relevant service experts. Typically something like this (obviously not an exhaustive list),\n\n\n\n\nBad Deployment:\n Roll it back.\n\n\nWeb Application Stuck/Crashed:\n Do a rolling restart.\n\n\nEvent Flood:\n Validate automatic throttling is sufficient, adjust manually if not.\n\n\nData Center Outage:\n Validate automation has removed bad data center. Force it to do so if not.\n\n\nDegraded Service Behavior without load:\n Gather forensic data (heap dumps, etc), and consider doing a rolling restart.\n\n\n\n\n\n\n\n\nListen for prompts from your Sysadmin regarding severity escalations, decide whether we need to announce publicly, and instruct customer liaison accordingly.\n\n\n\n\nAnnouncing publicly is at your discretion as TL. 
If you are unsure, then announce publicly (\"If in doubt, tweet it out\").\n\n\n\n\n\n\n\n\nOnce incident has recovered or is actively recovering, you can announce that the incident is over and that the call is ending. This usually indicates there's no more productive work to be done for the incident right now.\n\n\n\n\nMove the remaining, non-time-critical discussion to our internal Chat.\n\n\nFollow up to ensure the customer liaison wraps up the incident publicly.\n\n\nIdentify any post-incident clean-up work.\n\n\nYou may need to perform debriefing/analysis of the underlying root cause.\n\n\n\n\n\n\n\n\n(After call ends) Create the post-mortem page from the template, and assign an owner to the post-mortem for the incident.\n\n\n\n\n\n\n(After call ends) Send out an internal email explaining that we had a major incident, provide a link to the post-mortem.\n\n\n\n\n\n\nSteps for Sysadmin\n#\n\n\nYou are there to support the TL in whatever they need.\n\n\n\n\n\n\nMonitor the status, and notify the TL if/when the incident escalates in severity level.\n\n\n\n\n\n\nBe prepared to page other people as directed by the Team Leader.\n\n\n\n\n\n\nProvide regular status updates in our internal Chat (roughly every 30mins) to the executive team, giving an executive summary of the current status. Keep it short and to the point, and use @\n.\n\n\n\n\n\n\nPerform any remediations, checking graphs, analysis or investigating logs unless otherwse delegated by the TL. \n\n\n\n\n\n\nFollow instructions from the Team Leader.\n\n\n\n\n\n\nSteps for Scribe\n#\n\n\nYou are there to document the key information from the incident in Slack, DoIT, our WiKi, etc.\n\n\n\n\n\n\nUpdate the apropriate channel with who the TL is, who the Sysadmin is, and that you're the scribe (if not already done).\n\n\n\n\ne.g. 
\"TL: Bob Boberson, Sysadmin: Gigi Con, Scribe: Writer Writerson\"\n\n\n\n\n\n\n\n\nYou should add notes to the proper channels when significant actions are taken, or findings are determined. You don't need to wait for the TL to direct this - use your own judgment.\n\n\n\n\nYou should also add \nTODO\n notes to the proper channel that indicate follow-ups slated for later.\n\n\n\n\n\n\n\n\nFollow instructions from the Team Leader.\n\n\n\n\n\n\nSteps for Subject Matter Experts\n#\n\n\nYou are there to support the team leader in identifying the cause of the incident, suggesting and evaluation repair actions, and following through on the repair actions.\n\n\n\n\n\n\nInvestigate the incident by analyzing any graphs or logs at your disposal. Announce all findings to the incident commander.\n\n\n\n\nIf you are unsure of the cause, that's fine, state that you are investigating and provide regular updates to the TL.\n\n\n\n\n\n\n\n\nAnnounce all suggestions for resolution to the team leader, it is their decision on how to proceed, do not follow any actions unless told to do so!\n\n\n\n\n\n\nFollow instructions from the team leader.\n\n\n\n\n\n\n(Optional) Once the call is over and post-mortem is created, add any notes you think are relevant to the post-mortem page.\n\n\n\n\n\n\nSteps for Customer Liaison\n#\n\n\nBe on stand-by to post public facing messages regarding the incident.\n\n\n\n\n\n\nYou will typically be required to update the status page and to send Tweets or other communications from our various accounts at certain times during the call.\n\n\n\n\n\n\nFollow instructions from the Team Leader.", 
             "title": "During An Incident"
         }, 
         {
             "location": "/during/during_an_incident/#dont-panic", 
-            "text": "Join the incident call and chat (see links above).   Anyone is free to join the call or chat to observe and follow along with the incident.  If you wish to participate however, you should join both. If you can't join the call for some reason, you should have a dedicated proxy for the call. Disjointed discussions in the chat room are ultimately distracting.     Follow along with the call/chat, add any comments you feel are appropriate, but keep the discussion relevant to the problem at hand.   If you are not an SME, try to filter any discussion through the primary SME for your service. Too many people discussing at once get become overwhelming, so we should try to maintain a hierarchical structure to the call if possible.     Follow instructions from the Team Leader.   Is there no TL on the call?  Manually page them via Slack, with  !tl page  in Slack. This will page the primary and backup TL's at the same time.  Never hesitate to page the TL. It's much better to have them and not need them than the other way around.        Not a call?  Not all issues begin with a formal call. Some issues are self-explanatory and automatically generated via our monitoring platforms, a customer logging on to our portal, etc. In these scenarios  DoIT  is the definitive source. If that is not sufficient ask your TL.", 
+            "text": "Join the incident call and chat (see links above).   Anyone is free to join the call or chat to observe and follow along with the incident.  If you wish to participate however, you should join both. If you can't join the call for some reason, you should have a dedicated proxy for the call. Disjointed discussions in the chat room are ultimately distracting.     Follow along with the call/chat, add any comments you feel are appropriate, but keep the discussion relevant to the problem at hand.   If you are not an SME, try to filter any discussion through the primary SME for your service. Too many people discussing at once becomes overwhelming, so we try to maintain a hierarchical structure to the call if possible.     Follow instructions from the Team Leader.   Is there no TL on the call?  Call them!   Never hesitate to call the TL. It's much better to have them and not need them than the other way around.        Not a call?  Not all issues begin with a formal call. Some issues are self-explanatory and automatically generated via our monitoring platforms, a customer logging on to our portal, etc. In these scenarios  DoIT  is the definitive source. If that is not sufficient ask your TL and Sysadmin.", 
             "title": "Don't Panic!"
         }, 
         {
             "location": "/during/during_an_incident/#steps-for-the-team-leader", 
-            "text": "Resolve the incident as quickly and as safely as possible, use the Sysadmin to assist you. Delegate any tasks to relevant experts at your discretion.    Announce on the call, in DoIT and in Slack that you are the team leader, who you have designated as sysadmin (usually the backup TL), and scribe/juniors if any.    Identify if there is an obvious cause to the incident (recent deployment, spike in traffic, etc.), delegate investigation to relevant experts,   Use the service experts on the call to assist in the analysis. They should be able to quickly provide confirmation of the cause, but not always. It's the call of the TL on how to proceed in cases where the cause is not positively known. Confer with service owners and use their knowledge to help you.     Identify investigation   repair actions (roll back, rate-limit services, etc) and delegate actions to relevant service experts. Typically something like this (obviously not an exhaustive list),   Bad Deployment:  Roll it back.  Web Application Stuck/Crashed:  Do a rolling restart.  Event Flood:  Validate automatic throttling is sufficient, adjust manually if not.  Data Center Outage:  Validate automation has removed bad data center. Force it to do so if not.  Degraded Service Behavior without load:  Gather forensic data (heap dumps, etc), and consider doing a rolling restart.     Listen for prompts from your Sysadmin regarding severity escalations, decide whether we need to announce publicly, and instruct customer liaison accordingly.   Announcing publicly is at your discretion as TL. If you are unsure, then announce publicly (\"If in doubt, tweet it out\").     Once incident has recovered or is actively recovering, you can announce that the incident is over and that the call is ending. This usually indicates there's no more productive work to be done for the incident right now.   Move the remaining, non-time-critical discussion to Slack.  
Follow up to ensure the customer liaison wraps up the incident publicly.  Identify any post-incident clean-up work.  You may need to perform debriefing/analysis of the underlying root cause.     (After call ends) Create the post-mortem page from the template, and assign an owner to the post-mortem for the incident.    (After call ends) Send out an internal email explaining that we had a major incident, provide a link to the post-mortem.", 
+            "text": "Resolve the incident as quickly and as safely as possible, use the Sysadmin to assist you. Delegate any tasks to relevant experts at your discretion.    Announce on the call, in DoIT and in our internal Chat that you are the team leader, who you have designated as sysadmin (usually the backup TL), and scribe/juniors if any.    Identify if there is an obvious cause to the incident (recent deployment, spike in traffic, etc.), delegate investigation to relevant experts,   Use the service experts on the call to assist in the analysis. They should be able to quickly provide confirmation of the cause, but not always. It's the call of the TL on how to proceed in cases where the cause is not positively known. Confer with service owners and use their knowledge to help you.     Identify investigation   repair actions (roll back, rate-limit services, etc) and delegate actions to relevant service experts. Typically something like this (obviously not an exhaustive list),   Bad Deployment:  Roll it back.  Web Application Stuck/Crashed:  Do a rolling restart.  Event Flood:  Validate automatic throttling is sufficient, adjust manually if not.  Data Center Outage:  Validate automation has removed bad data center. Force it to do so if not.  Degraded Service Behavior without load:  Gather forensic data (heap dumps, etc), and consider doing a rolling restart.     Listen for prompts from your Sysadmin regarding severity escalations, decide whether we need to announce publicly, and instruct customer liaison accordingly.   Announcing publicly is at your discretion as TL. If you are unsure, then announce publicly (\"If in doubt, tweet it out\").     Once incident has recovered or is actively recovering, you can announce that the incident is over and that the call is ending. This usually indicates there's no more productive work to be done for the incident right now.   Move the remaining, non-time-critical discussion to our internal Chat.  
Follow up to ensure the customer liaison wraps up the incident publicly.  Identify any post-incident clean-up work.  You may need to perform debriefing/analysis of the underlying root cause.     (After call ends) Create the post-mortem page from the template, and assign an owner to the post-mortem for the incident.    (After call ends) Send out an internal email explaining that we had a major incident, provide a link to the post-mortem.", 
             "title": "Steps for the Team Leader"
         }, 
         {
             "location": "/during/during_an_incident/#steps-for-sysadmin", 
-            "text": "You are there to support the TL in whatever they need.    Monitor the status, and notify the TL if/when the incident escalates in severity level.    Be prepared to page other people as directed by the Team Leader.    Provide regular status updates in Slack (roughly every 30mins) to the executive team, giving an executive summary of the current status. Keep it short and to the point, and use @here.    Perform any remediations, checking graphs, analysis or investigating logs unless otherwse delegated by the TL.     Follow instructions from the Team Leader.", 
+            "text": "You are there to support the TL in whatever they need.    Monitor the status, and notify the TL if/when the incident escalates in severity level.    Be prepared to page other people as directed by the Team Leader.    Provide regular status updates in our internal Chat (roughly every 30mins) to the executive team, giving an executive summary of the current status. Keep it short and to the point, and use @ .    Perform any remediations, checking graphs, analysis or investigating logs unless otherwise delegated by the TL.     Follow instructions from the Team Leader.", 
             "title": "Steps for Sysadmin"
         }, 
         {
@@ -332,7 +327,7 @@
         }, 
         {
             "location": "/during/security_incident_response/", 
-            "text": "Team Leader Required\n\n\nAs with all major incidents, security ones will also involve a Team Leader, who will delegate the tasks to relevant resolvers. Tasks may be performed in parallel as assigned by the TL. Contact one at the earliest possible opportunity.\n\n\n\n\nChecklist\n#\n\n\nDetails for each of these items are available in the next section.\n\n\n\n\nStop the attack in progress.\n\n\nCut off the attack vector.\n\n\nAssemble the response team.\n\n\nIsolate affected instances.\n\n\nIdentify timeline of attack.\n\n\nIdentify compromised data.\n\n\nAssess risk to other systems.\n\n\nAssess risk of re-attack.\n\n\nApply additional mitigations, additions to monitoring, etc.\n\n\nForensic analysis of compromised systems.\n\n\nInternal communication.\n\n\nInvolve law enforcement.\n\n\nReach out to external parties that may have been used as vector for attack.\n\n\nExternal communication.\n\n\n\n\n\n\nAttack Mitigation\n#\n\n\nStop the attack as quickly as you can, via any means necessary. Shut down servers, network isolate them, turn off a data center if you have to. 
Some common things to try,\n\n\n\n\nShutdown the instance from the provider console (do not delete or terminate if you can help it, as we'll need to do forensics).\n\n\nIf you happen to be logged into the box you can try to,\n\n\nRe-instate our default iptables rules to restrict traffic.\n\n\nkill -9\n any active session you think is an attacker.\n\n\nChange root password, and update /etc/shadow to lock out all other users.\n\n\nsudo shutdown now\n\n\n\n\n\n\n\n\nCut Off Attack Vector\n#\n\n\nIdentify the likely attack vectors and path/fix them so they cannot be re-exploited immediately after stopping the attack.\n\n\n\n\nIf you suspect a third-party provider is compromised, delete all accounts except your own (and those of others who are physically present) and immediately rotate your password and MFA tokens.\n\n\nIf you suspect a service application was an attack vector, disable any relevant code paths, or shut down the service entirely.\n\n\n\n\nAssemble Response Team\n#\n\n\nIdentify the key responders for the security incident, and keep them all in the loop. Set up a secure method of communicating all information associated with the incident. Details on the incident (or even the fact that an incident has occurred) should be kept private to the responders until you are confident the attack is not being triggered internally.\n\n\n\n\nThe security and site-reliability teams should usually be involved.\n\n\nA representative for any affected services should be involved.\n\n\nA Team Leader (TL) should be appointed, who will also appoint the usual incident command roles. The incident command team will be responsible for keeping documentation of actions taken, and for notifying internal stakeholders as appropriate.\n\n\nDo not communicate with anyone not on the response team about the incident until forensics has been performed. 
The attack could be happening internally.\n\n\nGive the project an innocuous codename that can be used for chats/documents so if anyone overhears they don't realize it's a security incident. (e.g. sapphire-unicorn).\n\n\nPrefix all emails, and chat topics with \"Attorney Work Project\".\n\n\n\n\nIsolate Affected Instances\n#\n\n\nAny instances which were affected by the attack should be immediately isolated from any other instances. As soon as possible, an image of the system should be taken and put into a read-only cold storage for later forensic analysis.\n\n\n\n\nBlacklist the IP addresses for any affected instances from all other hosts.\n\n\nTurn off and shutdown the instances immediately if you didn't do that to stop the attack.\n\n\nTake a disk image for any disks attached to the instances, and ship them to an off-site cold storage location. You should make sure these images are read-only and cannot be tampered with.\n\n\n\n\nIdentify Timeline of Attack\n#\n\n\nWork with all tools at your disposal to identify the timeline of the attack, along with exactly what the attacker did.\n\n\n\n\nAny reconnaissance the attacker performed on the system before the attack started.\n\n\nWhen the attacker gained access to the system.\n\n\nWhat actions the attacker performed on the system, and when.\n\n\nIdentify how long the attacker had access to the system before they were detected, and before they were kicked out.\n\n\nIdentify any queries the attacker ran on databases.\n\n\nTry to identify if the attacker still has access to the system via another back door. 
Monitor logs for unusual activity, etc.\n\n\n\n\nCompromised Data\n#\n\n\nUsing forensic analysis of log files, time-series graphs, and any other information/tools at your disposal, attempt to identify what information was compromised (if any),\n\n\n\n\nIdentify any data that was compromised during the attack.\n\n\nWas any data exfiltrated from a database?\n\n\nWhat keys were on the system that are now considering compromised?\n\n\nWas the attacker able to identify other components of the system (map out the network, etc).\n\n\n\n\n\n\nFind exactly what customer data has been compromised, if any.\n\n\n\n\nAssess Risk\n#\n\n\nBased on the data that was compromised, assess the risk to other systems.\n\n\n\n\nDoes the attacker have enough information to find another way in?\n\n\nWere any passwords or keys stored on the host? If so, they should be considered compromised, regardless of how they were stored.\n\n\nAny user accounts that were used in the initial attack should rotate all of their keys and passwords on every other system they have an account.\n\n\n\n\nApply Additional Mitigations\n#\n\n\nStart applying mitigations to other parts of your system.\n\n\n\n\nRotate any compromised data.\n\n\nIdentify any new alerting which is needed to notify of a similar breach.\n\n\nBlock any IP addresses associated with the attack.\n\n\nIdentify any keys/credentials that are compromised and revoke their access immediately.\n\n\n\n\nForensic Analysis\n#\n\n\nOnce you are confident the systems are secured, and enough monitoring is in place to detect another attack, you can move onto the forensic analysis stage.\n\n\n\n\nTake any read-only images you created, any access logs you have, and comb through them for more information about the attack.\n\n\nIdentify exactly what happened, how it happened, and how to prevent it in future.\n\n\nKeep track of all IP addresses involved in the attack.\n\n\nMonitor logs for any attempt to regain access to the system by the 
attacker.\n\n\n\n\nInternal Communication\n#\n\n\nDelegate to:\n VP or Director of Engineering\n\n\nCommunicate internally only once you are confident (via forensic analysis) that the attack was not sourced internally.\n\n\n\n\nDon't go into too much detail.\n\n\nOverview the timeline.\n\n\nDiscuss mitigation steps taken.\n\n\nFollow up with more information once it is known.\n\n\n\n\nLiaise With Law Enforcement / External Actors\n#\n\n\nDelegate to:\n VP or Director of Engineering\n\n\nWork with law enforcement to identify the source of the attack, letting any system owners know that systems under their control may be compromised, etc.\n\n\n\n\nContact local law enforcement.\n\n\nContact FBI.\n\n\nContact operators for any systems used in the attack, their systems may also have been compromised.\n\n\nContact security companies to help in assessing risk and any PR next steps.\n\n\n\n\nExternal Communication\n#\n\n\nDelegate to:\n TL, Marketing Team\n\n\nOnce you have validated all of the information you have is accurate, have a timeline of events, and know exactly what information was compromised, how it was compromised, and sure that it won't happen again. Only then should you prepare and release a public statement to customers informing them of the compromised information and any steps they need to take.\n\n\n\n\nInclude the date in the title of any announcement, so that it's never confused for a potential new breach.\n\n\nDon't say \"We take security very seriously\". It makes everyone cringe when they read it.\n\n\nBe honest, accept responsibility, and present the facts, along with exactly how we plan to prevent such things in future.\n\n\nBe as detailed as possible with the timeline.\n\n\nBe as detailed as possible in what information was compromised, and how it affects customers. If we were storing something we shouldn't have been, be honest about it. 
It'll come out later and it'll be much worse.\n\n\nDon't name and shame any external parties that might have caused the compromise. It's bad form. (Unless they've already publicly disclosed, in which case we can link to their disclosure).\n\n\nRelease the external communication as soon as possible, preferably within a few days of the compromise. The longer we wait, the worse it will be.\n\n\nFigure out if there is a way to get in touch with customers' internal security teams before the general public notice is sent.\n\n\n\n\n\n\nAdditional Reading\n#\n\n\n\n\nComputer Security Incident Handling Guide\n (NIST)\n\n\nIncident Handler's Handbook\n (SANS)\n\n\nResponding to IT Security Incidents\n (Microsoft)\n\n\nDefining Incident Management Processes for CSIRTs: A Work in Progress\n (CMU)\n\n\nCreating and Managing Computer Security Incident Handling Teams (CSIRTS)\n (CERT)\n\n\nGoogle Infrastructure Security Design Overview\n (Google)", 
+            "text": "Team Leader Required\n\n\nAs with all major incidents, security ones will also involve a Team Leader, who will delegate the tasks to relevant resolvers. Tasks may be performed in parallel as assigned by the TL. Contact one at the earliest possible opportunity.\n\n\n\n\nChecklist\n#\n\n\nDetails for each of these items are available in the next section.\n\n\n\n\nStop the attack in progress.\n\n\nCut off the attack vector.\n\n\nAssemble the response team.\n\n\nIsolate affected instances.\n\n\nIdentify timeline of attack.\n\n\nIdentify compromised data.\n\n\nAssess risk to other systems.\n\n\nAssess risk of re-attack.\n\n\nApply additional mitigations, additions to monitoring, etc.\n\n\nForensic analysis of compromised systems.\n\n\nInternal communication.\n\n\nInvolve law enforcement.\n\n\nReach out to external parties that may have been used as vector for attack.\n\n\nExternal communication.\n\n\n\n\n\n\nAttack Mitigation\n#\n\n\nStop the attack as quickly as you can, via any means necessary. Shut down servers, network isolate them, turn off a data center if you have to. 
Some common things to try,\n\n\n\n\nShutdown the instance from the provider console (do not delete or terminate if you can help it, as we'll need to do forensics).\n\n\nIf you happen to be logged into the box you can try to,\n\n\nApply firewall rules to restrict traffic.\n\n\nkill -9\n any active session you think is an attacker.\n\n\nChange root password, and update /etc/shadow to lock out all other users.\n\n\nsudo shutdown now\n\n\n\n\n\n\n\n\nCut Off Attack Vector\n#\n\n\nIdentify the likely attack vectors and patch/fix them so they cannot be re-exploited immediately after stopping the attack.\n\n\n\n\nIf you suspect a third-party provider is compromised, delete all accounts except your own (and those of others who are physically present) and immediately rotate your password and MFA tokens.\n\n\nDisable/remove SSH keys that do not belong to you or to others who are physically present.\n\n\nIf you suspect a service application was an attack vector, disable any relevant code paths, or shut down the service entirely.\n\n\n\n\nAssemble Response Team\n#\n\n\nIdentify the key responders for the security incident, and keep them all in the loop. Set up a secure method of communicating all information associated with the incident (internal Chat is one option). Details on the incident (or even the fact that an incident has occurred) should be kept private to the responders until you are confident the attack is not being triggered internally.\n\n\n\n\nThe security and site-reliability teams should usually be involved.\n\n\nA representative for any affected services should be involved.\n\n\nA Team Leader (TL) should be appointed, who will also appoint the usual incident command roles. The incident command team will be responsible for keeping documentation of actions taken, and for notifying internal stakeholders as appropriate.\n\n\nDo not communicate with anyone not on the response team about the incident until forensics has been performed. 
The attack could be happening internally.\n\n\nGive the project an innocuous codename that can be used for chats/documents so if anyone overhears they don't realize it's a security incident. (e.g. sapphire-unicorn).\n\n\nPrefix all emails, and chat topics with \"Legal Work Project\".\n\n\n\n\nIsolate Affected Instances\n#\n\n\nAny instances which were affected by the attack should be immediately isolated from any other instances. As soon as possible, an image of the system should be taken and put into a read-only cold storage for later forensic analysis.\n\n\n\n\nBlacklist the IP addresses for any affected instances from all other hosts.\n\n\nTurn off and shutdown the instances immediately if you didn't do that to stop the attack.\n\n\nTake a disk image for any disks attached to the instances, and ship them to an off-site cold storage location. You should make sure these images are read-only and cannot be tampered with.\n\n\n\n\nIdentify Timeline of Attack\n#\n\n\nWork with all tools at your disposal to identify the timeline of the attack, along with exactly what the attacker did.\n\n\n\n\nAny reconnaissance the attacker performed on the system before the attack started.\n\n\nWhen the attacker gained access to the system.\n\n\nWhat actions the attacker performed on the system, and when.\n\n\nIdentify how long the attacker had access to the system before they were detected, and before they were kicked out.\n\n\nIdentify any queries the attacker ran on databases.\n\n\nTry to identify if the attacker still has access to the system via another back door. 
Monitor logs for unusual activity, etc.\n\n\n\n\nCompromised Data\n#\n\n\nUsing forensic analysis of log files, time-series graphs, and any other information/tools at your disposal, attempt to identify what information was compromised (if any),\n\n\n\n\nIdentify any data that was compromised during the attack.\n\n\nWas any data exfiltrated from a database?\n\n\nWhat keys were on the system that are now considering compromised?\n\n\nWas the attacker able to identify other components of the system (map out the network, etc).\n\n\n\n\n\n\nFind exactly what customer data has been compromised, if any.\n\n\n\n\nAssess Risk\n#\n\n\nBased on the data that was compromised, assess the risk to other systems.\n\n\n\n\nDoes the attacker have enough information to find another way in?\n\n\nWere any passwords or keys stored on the host? If so, they should be considered compromised, regardless of how they were stored.\n\n\nAny user accounts that were used in the initial attack should rotate all of their keys and passwords on every other system they have an account.\n\n\n\n\nApply Additional Mitigations\n#\n\n\nStart applying mitigations to other parts of your system.\n\n\n\n\nRotate any compromised data.\n\n\nIdentify any new alerting which is needed to notify of a similar breach.\n\n\nBlock any IP addresses associated with the attack.\n\n\nIdentify any keys/credentials that are compromised and revoke their access immediately.\n\n\n\n\nForensic Analysis\n#\n\n\nOnce you are confident the systems are secured, and enough monitoring is in place to detect another attack, you can move onto the forensic analysis stage.\n\n\n\n\nTake any read-only images you created, any access logs you have, and comb through them for more information about the attack.\n\n\nIdentify exactly what happened, how it happened, and how to prevent it in future.\n\n\nKeep track of all IP addresses involved in the attack.\n\n\nMonitor logs for any attempt to regain access to the system by the 
attacker.\n\n\n\n\nInternal Communication\n#\n\n\nDelegate to:\n CTO, GM\n\n\nCommunicate internally only once you are confident (via forensic analysis) that the attack was not sourced internally.\n\n\n\n\nDon't go into too much detail.\n\n\nOverview the timeline.\n\n\nDiscuss mitigation steps taken.\n\n\nFollow up with more information once it is known.\n\n\n\n\nLiaise With Law Enforcement / External Actors\n#\n\n\nDelegate to:\n CTO, GM\n\n\nWork with law enforcement to identify the source of the attack, letting any system owners know that systems under their control may be compromised, etc.\n\n\n\n\nContact local law enforcement.\n\n\nContact FBI.\n\n\nContact operators for any systems used in the attack, their systems may also have been compromised.\n\n\nContact security companies to help in assessing risk and any PR next steps.\n\n\n\n\nExternal Communication\n#\n\n\nDelegate to:\n TL, PR/Marketing\n\n\nOnce you have validated all of the information you have is accurate, have a timeline of events, and know exactly what information was compromised, how it was compromised, and sure that it won't happen again. Only then should you prepare and release a public statement to customers informing them of the compromised information and any steps they need to take.\n\n\n\n\nInclude the date in the title of any announcement, so that it's never confused for a potential new breach.\n\n\nDon't say \"We take security very seriously\". It makes everyone cringe when they read it.\n\n\nBe honest, accept responsibility, and present the facts, along with exactly how we plan to prevent such things in future.\n\n\nBe as detailed as possible with the timeline.\n\n\nBe as detailed as possible in what information was compromised, and how it affects customers. If we were storing something we shouldn't have been, be honest about it. It'll come out later and it'll be much worse.\n\n\nDon't name and shame any external parties that might have caused the compromise. It's bad form. 
(Unless they've already publicly disclosed, in which case we can link to their disclosure).\n\n\nRelease the external communication as soon as possible, preferably within a few days of the compromise. The longer we wait, the worse it will be.\n\n\nFigure out if there is a way to get in touch with customers' internal security teams before the general public notice is sent.\n\n\n\n\n\n\nAdditional Reading\n#\n\n\n\n\nComputer Security Incident Handling Guide\n (NIST)\n\n\nIncident Handler's Handbook\n (SANS)\n\n\nResponding to IT Security Incidents\n (Microsoft)\n\n\nDefining Incident Management Processes for CSIRTs: A Work in Progress\n (CMU)\n\n\nCreating and Managing Computer Security Incident Handling Teams (CSIRTS)\n (CERT)\n\n\nGoogle Infrastructure Security Design Overview\n (Google)", 
             "title": "Security Incident"
         }, 
         {
@@ -342,17 +337,17 @@
         }, 
         {
             "location": "/during/security_incident_response/#attack-mitigation", 
-            "text": "Stop the attack as quickly as you can, via any means necessary. Shut down servers, network isolate them, turn off a data center if you have to. Some common things to try,   Shutdown the instance from the provider console (do not delete or terminate if you can help it, as we'll need to do forensics).  If you happen to be logged into the box you can try to,  Re-instate our default iptables rules to restrict traffic.  kill -9  any active session you think is an attacker.  Change root password, and update /etc/shadow to lock out all other users.  sudo shutdown now", 
+            "text": "Stop the attack as quickly as you can, via any means necessary. Shut down servers, network isolate them, turn off a data center if you have to. Some common things to try,   Shutdown the instance from the provider console (do not delete or terminate if you can help it, as we'll need to do forensics).  If you happen to be logged into the box you can try to,  Apply firewall rules to restrict traffic.  kill -9  any active session you think is an attacker.  Change root password, and update /etc/shadow to lock out all other users.  sudo shutdown now", 
             "title": "Attack Mitigation"
         }, 
         {
             "location": "/during/security_incident_response/#cut-off-attack-vector", 
-            "text": "Identify the likely attack vectors and path/fix them so they cannot be re-exploited immediately after stopping the attack.   If you suspect a third-party provider is compromised, delete all accounts except your own (and those of others who are physically present) and immediately rotate your password and MFA tokens.  If you suspect a service application was an attack vector, disable any relevant code paths, or shut down the service entirely.", 
+            "text": "Identify the likely attack vectors and patch/fix them so they cannot be re-exploited immediately after stopping the attack.   If you suspect a third-party provider is compromised, delete all accounts except your own (and those of others who are physically present) and immediately rotate your password and MFA tokens.  Disable/remove SSH keys that do not belong to you or to others who are physically present.  If you suspect a service application was an attack vector, disable any relevant code paths, or shut down the service entirely.", 
             "title": "Cut Off Attack Vector"
         }, 
         {
             "location": "/during/security_incident_response/#assemble-response-team", 
-            "text": "Identify the key responders for the security incident, and keep them all in the loop. Set up a secure method of communicating all information associated with the incident. Details on the incident (or even the fact that an incident has occurred) should be kept private to the responders until you are confident the attack is not being triggered internally.   The security and site-reliability teams should usually be involved.  A representative for any affected services should be involved.  A Team Leader (TL) should be appointed, who will also appoint the usual incident command roles. The incident command team will be responsible for keeping documentation of actions taken, and for notifying internal stakeholders as appropriate.  Do not communicate with anyone not on the response team about the incident until forensics has been performed. The attack could be happening internally.  Give the project an innocuous codename that can be used for chats/documents so if anyone overhears they don't realize it's a security incident. (e.g. sapphire-unicorn).  Prefix all emails, and chat topics with \"Attorney Work Project\".", 
+            "text": "Identify the key responders for the security incident, and keep them all in the loop. Set up a secure method of communicating all information associated with the incident (internal Chat is one option). Details on the incident (or even the fact that an incident has occurred) should be kept private to the responders until you are confident the attack is not being triggered internally.   The security and site-reliability teams should usually be involved.  A representative for any affected services should be involved.  A Team Leader (TL) should be appointed, who will also appoint the usual incident command roles. The incident command team will be responsible for keeping documentation of actions taken, and for notifying internal stakeholders as appropriate.  Do not communicate with anyone not on the response team about the incident until forensics has been performed. The attack could be happening internally.  Give the project an innocuous codename that can be used for chats/documents so if anyone overhears they don't realize it's a security incident. (e.g. sapphire-unicorn).  Prefix all emails, and chat topics with \"Legal Work Project\".", 
             "title": "Assemble Response Team"
         }, 
         {
@@ -387,17 +382,17 @@
         }, 
         {
             "location": "/during/security_incident_response/#internal-communication", 
-            "text": "Delegate to:  VP or Director of Engineering  Communicate internally only once you are confident (via forensic analysis) that the attack was not sourced internally.   Don't go into too much detail.  Overview the timeline.  Discuss mitigation steps taken.  Follow up with more information once it is known.", 
+            "text": "Delegate to:  CTO, GM  Communicate internally only once you are confident (via forensic analysis) that the attack was not sourced internally.   Don't go into too much detail.  Overview the timeline.  Discuss mitigation steps taken.  Follow up with more information once it is known.", 
             "title": "Internal Communication"
         }, 
         {
             "location": "/during/security_incident_response/#liaise-with-law-enforcement-external-actors", 
-            "text": "Delegate to:  VP or Director of Engineering  Work with law enforcement to identify the source of the attack, letting any system owners know that systems under their control may be compromised, etc.   Contact local law enforcement.  Contact FBI.  Contact operators for any systems used in the attack, their systems may also have been compromised.  Contact security companies to help in assessing risk and any PR next steps.", 
+            "text": "Delegate to:  CTO, GM  Work with law enforcement to identify the source of the attack, letting any system owners know that systems under their control may be compromised, etc.   Contact local law enforcement.  Contact FBI.  Contact operators for any systems used in the attack, their systems may also have been compromised.  Contact security companies to help in assessing risk and any PR next steps.", 
             "title": "Liaise With Law Enforcement / External Actors"
         }, 
         {
             "location": "/during/security_incident_response/#external-communication", 
-            "text": "Delegate to:  TL, Marketing Team  Once you have validated all of the information you have is accurate, have a timeline of events, and know exactly what information was compromised, how it was compromised, and sure that it won't happen again. Only then should you prepare and release a public statement to customers informing them of the compromised information and any steps they need to take.   Include the date in the title of any announcement, so that it's never confused for a potential new breach.  Don't say \"We take security very seriously\". It makes everyone cringe when they read it.  Be honest, accept responsibility, and present the facts, along with exactly how we plan to prevent such things in future.  Be as detailed as possible with the timeline.  Be as detailed as possible in what information was compromised, and how it affects customers. If we were storing something we shouldn't have been, be honest about it. It'll come out later and it'll be much worse.  Don't name and shame any external parties that might have caused the compromise. It's bad form. (Unless they've already publicly disclosed, in which case we can link to their disclosure).  Release the external communication as soon as possible, preferably within a few days of the compromise. The longer we wait, the worse it will be.  Figure out if there is a way to get in touch with customers' internal security teams before the general public notice is sent.", 
+            "text": "Delegate to:  TL, PR/Marketing  Once you have validated all of the information you have is accurate, have a timeline of events, and know exactly what information was compromised, how it was compromised, and sure that it won't happen again. Only then should you prepare and release a public statement to customers informing them of the compromised information and any steps they need to take.   Include the date in the title of any announcement, so that it's never confused for a potential new breach.  Don't say \"We take security very seriously\". It makes everyone cringe when they read it.  Be honest, accept responsibility, and present the facts, along with exactly how we plan to prevent such things in future.  Be as detailed as possible with the timeline.  Be as detailed as possible in what information was compromised, and how it affects customers. If we were storing something we shouldn't have been, be honest about it. It'll come out later and it'll be much worse.  Don't name and shame any external parties that might have caused the compromise. It's bad form. (Unless they've already publicly disclosed, in which case we can link to their disclosure).  Release the external communication as soon as possible, preferably within a few days of the compromise. The longer we wait, the worse it will be.  Figure out if there is a way to get in touch with customers' internal security teams before the general public notice is sent.", 
             "title": "External Communication"
         }, 
         {
@@ -407,12 +402,12 @@
         }, 
         {
             "location": "/after/post_mortem_process/", 
-            "text": "For every major issue (SR/IN +major), we need to follow up with a post-mortem. A blame-free, detailed description, of exactly what went wrong in order to cause the incident, along with a list of steps to take in order to prevent a similar incident from occurring again in the future. The incident response process itself should also be included.\n\n\n\n\nOwner Designation\n#\n\n\nThe first step is that a post-mortem owner will be designated. This is done by the TL either at the end of a major incident call, or very shortly after. You will be notified directly by the TL if you are the owner for the post-mortem. The owner is responsible for populating the post-mortem page, looking up logs, managing the followup investigation, and keeping all interested parties in the loop. Please use DoIT and Slack for coordinating followup. A detailed list of the steps is available below,\n\n\nOwner Responsibilities\n#\n\n\nAs owner of a post-mortem, you are responsible for the following,\n\n\n\n\nScheduling the post-mortem meeting (on a shared calendar) and inviting the relevant people (this should be scheduled within 5 business days of the incident).\n\n\nUpdating the page with all of the necessary content.\n\n\nInvestigating the incident, pulling in whomever you need from other teams to assist in the investigation.\n\n\nCreating follow-up DoIT cards (\nYou are only responsible for creating the cards, not following them up to resolution\n).\n\n\nRunning the post-mortem meeting (\nthese generally run themselves, but you should get people back on topic if the conversation starts to wander\n).\n\n\nIn cases where we need a public blog post, creating \n reviewing it with appropriate parties.\n\n\n\n\nPost-Mortem Wiki Page\n#\n\n\nOnce you've been designated as the owner of a post-mortem, you should start updating the page with all the relevant information.\n\n\n\n\n\n\n(If not already done by the TL) Create a new post-mortem page for the 
incident.\n\n\n\n\n\n\nSchedule a post-mortem meeting for within 5 business days of the incident. You should schedule this before filling in the page, just so it's on the calendar.\n\n\n\n\nCreate the meeting on the \"Incident Post-Mortem Meetings\" shared calendar.\n\n\n\n\n\n\n\n\nBegin populating the page with all of the information you have.\n\n\n\n\nThe timeline should be the main focus to begin with.\n\n\nThe timeline should include important changes in status/impact, and also key actions taken by responders.\n\n\nYou should mark the start of the incident in red, and the resolution in green (for when we went into/out of SR/IN +major).\n\n\n\n\n\n\nGo through the history in DoIT and Slack to identify the responders, and add them to the page.\n\n\nIdentify the Team Leader and Scribe in this list.\n\n\n\n\n\n\n\n\n\n\n\n\nPopulate the page with more detailed information.\n\n\n\n\nFor each item in the timeline, identify a metric, or some third-party page where the data came from. This could be a link to a Check_MK graph, a logwatch search, a Tweet, etc. Anything which shows the data point you're trying to illustrate in the timeline.\n\n\n\n\n\n\n\n\nPerform an analysis of the incident.\n\n\n\n\nCapture all available data regarding the incident. 
What caused it, how many customers were affected, etc.\n\n\nAny commands or queries you use to look up data should be posted in the page so others can see how the data was gathered.\n\n\nCapture the impact to customers (generally in terms of event submission, delayed processing, and slow notification delivery)\n\n\nIdentify the underlying cause of the incident (What happened, and why did it happen).\n\n\n\n\n\n\n\n\nCreate any followup action DoIT cards (or note down topics for discussion if we need to decide on a direction to go before creating tickets),\n\n\n\n\nGo through the history in DoIT, Slack to identify any TODO items.\n\n\nLabel all tickets with their severity level and date tags.\n\n\nAny actions which can reduce re-occurrence of the incident.\n\n\n(There may be some trade-off here, and that's fine. Sometimes the ROI isn't worth the effort that would go into it).\n\n\n\n\n\n\nIdentify any actions which can make our incident response process better.\n\n\nBe careful with creating too many cards. Generally we only want to create things that are of top priority. Things that absolutely should be dealt with.\n\n\n\n\n\n\n\n\nWrite the external message that will be sent to customers. This will be reviewed during the post-mortem meeting before it is sent out.\n\n\n\n\nAvoid using the word \"outage\" unless it really was a full outage, use the word \"incident\" instead. Customers generally see \"outage\" and assume everything was down, when in reality it was likely just some alerts delivered outside of SLA.\n\n\nLook at other examples of previous post-mortems to see the kind of thing you should send.\n\n\n\n\n\n\n\n\nPost-Mortem Meeting\n#\n\n\nThese meetings should generally last 15-30 minutes, and are intended to be a wrap up of the post-mortem process. We should discuss what happened, what we could've done better, and any followup actions we need to take. 
The goal is to suss out any disagreement on the facts, analysis, or recommended actions, and to get some wider awareness of the problems that are causing reliability issues for us.\n\n\nYou should invite the following people to the post-mortem meeting,\n\n\n\n\nAlways\n\n\nThe team leader.\n\n\nService owners involved in the incident.\n\n\nKey engineer(s)/responders involved in the incident.\n\n\n\n\n\n\nOptional\n\n\nCustomer liaison. (Only SR/IN +major incidents)\n\n\n\n\n\n\n\n\nA general agenda for the meeting would be something like,\n\n\n\n\nRecap the timeline, to make sure everyone agrees and is on the same page.\n\n\nRecap important points, and any unusual items.\n\n\nDiscuss how the problem could've been caught.\n\n\nDid it send any weak signals?\n\n\nCould it have been caught in tests, or loadtest environment?\n\n\n\n\n\n\nDiscuss customer impact. Any comments from customers, etc.\n\n\nReview action items that have been created, discuss if appropriate, or if more are needed, etc.\n\n\n\n\nExamples\n#\n\n\nHere are some examples of post-mortems from other companies as a reference,\n\n\n\n\nStripe\n\n\nLastPass\n\n\nAWS\n\n\nTwilio\n\n\nHeroku\n\n\nNetflix\n\n\nGOV.UK Rail Accident Investigation\n\n\nA List of Post-mortems!\n\n\n\n\nUseful Resources\n#\n\n\n\n\nAdvanced PostMortem Fu and Human Error 101 (Velocity 2011)\n\n\nBlame. Language. Sharing.", 
+            "text": "For every major issue (SR/IN +major), we need to follow up with a post-mortem. A blame-free, detailed description, of exactly what went wrong in order to cause the incident, along with a list of steps to take in order to prevent a similar incident from occurring again in the future. The incident response process itself should also be included.\n\n\n\n\nOwner Designation\n#\n\n\nThe first step is that a post-mortem owner will be designated. This is done by the TL either at the end of a major incident call, or very shortly after. You will be notified directly by the TL if you are the owner for the post-mortem. The owner is responsible for populating the post-mortem page, looking up logs, managing the followup investigation, and keeping all interested parties in the loop. Please use DoIT and our internal Chat for coordinating followup. A detailed list of the steps is available below,\n\n\nOwner Responsibilities\n#\n\n\nAs owner of a post-mortem, you are responsible for the following,\n\n\n\n\nScheduling the post-mortem meeting (on a shared calendar) and inviting the relevant people (this should be scheduled within 5 business days of the incident).\n\n\nUpdating the page with all of the necessary content.\n\n\nInvestigating the incident, pulling in whomever you need from other teams to assist in the investigation.\n\n\nCreating follow-up DoIT cards (\nYou are only responsible for creating the cards, not following them up to resolution\n).\n\n\nRunning the post-mortem meeting (\nthese generally run themselves, but you should get people back on topic if the conversation starts to wander\n).\n\n\nIn cases where we need a public blog post, creating \n reviewing it with appropriate parties.\n\n\n\n\nPost-Mortem Wiki Page\n#\n\n\nOnce you've been designated as the owner of a post-mortem, you should start updating the page with all the relevant information.\n\n\n\n\n\n\n(If not already done by the TL) Create a new post-mortem page for the 
incident.\n\n\n\n\n\n\nSchedule a post-mortem meeting for within 5 business days of the incident. You should schedule this before filling in the page, just so it's on the calendar.\n\n\n\n\nCreate the meeting on the \"Incident Post-Mortem Meetings\" shared calendar.\n\n\n\n\n\n\n\n\nBegin populating the page with all of the information you have.\n\n\n\n\nThe timeline should be the main focus to begin with.\n\n\nThe timeline should include important changes in status/impact, and also key actions taken by responders.\n\n\nYou should mark the start of the incident in red, and the resolution in green (for when we went into/out of SR/IN +major).\n\n\n\n\n\n\nGo through the history in DoIT and Slack to identify the responders, and add them to the page.\n\n\nIdentify the Team Leader and Scribe in this list.\n\n\n\n\n\n\n\n\n\n\n\n\nPopulate the page with more detailed information.\n\n\n\n\nFor each item in the timeline, identify a metric, or some third-party page where the data came from. This could be a link to a Check_MK graph, a logwatch search, a Tweet, etc. Anything which shows the data point you're trying to illustrate in the timeline.\n\n\n\n\n\n\n\n\nPerform an analysis of the incident.\n\n\n\n\nCapture all available data regarding the incident. 
What caused it, how many customers were affected, etc.\n\n\nAny commands or queries you use to look up data should be posted in the page so others can see how the data was gathered.\n\n\nCapture the impact to customers (generally in terms of event submission, delayed processing, and slow notification delivery)\n\n\nIdentify the underlying cause of the incident (What happened, and why did it happen).\n\n\n\n\n\n\n\n\nCreate any followup action DoIT cards (or note down topics for discussion if we need to decide on a direction to go before creating tickets),\n\n\n\n\nGo through the history in DoIT, Slack to identify any TODO items.\n\n\nLabel all tickets with their severity level and date tags.\n\n\nAny actions which can reduce re-occurrence of the incident.\n\n\n(There may be some trade-off here, and that's fine. Sometimes the ROI isn't worth the effort that would go into it).\n\n\n\n\n\n\nIdentify any actions which can make our incident response process better.\n\n\nBe careful with creating too many cards. Generally we only want to create things that are of top priority. Things that absolutely should be dealt with.\n\n\n\n\n\n\n\n\nWrite the external message that will be sent to customers. This will be reviewed during the post-mortem meeting before it is sent out.\n\n\n\n\nAvoid using the word \"outage\" unless it really was a full outage, use the word \"incident\" instead. Customers generally see \"outage\" and assume everything was down, when in reality it was likely just some alerts delivered outside of SLA.\n\n\nLook at other examples of previous post-mortems to see the kind of thing you should send.\n\n\n\n\n\n\n\n\nPost-Mortem Meeting\n#\n\n\nThese meetings should generally last 15-30 minutes, and are intended to be a wrap up of the post-mortem process. We should discuss what happened, what we could've done better, and any followup actions we need to take. 
The goal is to suss out any disagreement on the facts, analysis, or recommended actions, and to get some wider awareness of the problems that are causing reliability issues for us.\n\n\nYou should invite the following people to the post-mortem meeting,\n\n\n\n\nAlways\n\n\nThe team leader.\n\n\nService owners involved in the incident.\n\n\nKey engineer(s)/responders involved in the incident.\n\n\n\n\n\n\nOptional\n\n\nCustomer liaison. (Only SR/IN +major incidents)\n\n\n\n\n\n\n\n\nA general agenda for the meeting would be something like,\n\n\n\n\nRecap the timeline, to make sure everyone agrees and is on the same page.\n\n\nRecap important points, and any unusual items.\n\n\nDiscuss how the problem could've been caught.\n\n\nDid it send any weak signals?\n\n\nCould it have been caught in tests, or loadtest environment?\n\n\n\n\n\n\nDiscuss customer impact. Any comments from customers, etc.\n\n\nReview action items that have been created, discuss if appropriate, or if more are needed, etc.\n\n\n\n\nExamples\n#\n\n\nHere are some examples of post-mortems from other companies as a reference,\n\n\n\n\nStripe\n\n\nLastPass\n\n\nAWS\n\n\nTwilio\n\n\nHeroku\n\n\nNetflix\n\n\nGOV.UK Rail Accident Investigation\n\n\nA List of Post-mortems!\n\n\n\n\nUseful Resources\n#\n\n\n\n\nAdvanced PostMortem Fu and Human Error 101 (Velocity 2011)\n\n\nBlame. Language. Sharing.", 
             "title": "Post-Mortem Process"
         }, 
         {
             "location": "/after/post_mortem_process/#owner-designation", 
-            "text": "The first step is that a post-mortem owner will be designated. This is done by the TL either at the end of a major incident call, or very shortly after. You will be notified directly by the TL if you are the owner for the post-mortem. The owner is responsible for populating the post-mortem page, looking up logs, managing the followup investigation, and keeping all interested parties in the loop. Please use DoIT and Slack for coordinating followup. A detailed list of the steps is available below,", 
+            "text": "The first step is that a post-mortem owner will be designated. This is done by the TL either at the end of a major incident call, or very shortly after. You will be notified directly by the TL if you are the owner for the post-mortem. The owner is responsible for populating the post-mortem page, looking up logs, managing the followup investigation, and keeping all interested parties in the loop. Please use DoIT and our internal Chat for coordinating followup. A detailed list of the steps is available below,", 
             "title": "Owner Designation"
         }, 
         {
@@ -442,7 +437,7 @@
         }, 
         {
             "location": "/after/post_mortem_template/", 
-            "text": "This is a standard template for post-mortems. Each section describes the type of information you will want to put in that section.\n\n\n\n\n\n\nGuidelines\n\n\nThis page is intended to be reviewed during a post-mortem meeting that should be scheduled within 5 business days of any event.\nYour first step should be to schedule the post-mortem meeting in the shared calendar for within 5 business days after the incident.\nDon't wait until you've filled in the info to schedule the meeting, however make sure the page is completed by the meeting.\n\n\n\n\n Post-Mortem Owner:\n \nYour name goes here.\n\n\n Meeting Scheduled For:\n \nSchedule the meeting on the \"Incident Post-Mortem Meetings\" shared calendar, for within 5 business days after the incident. Put the date/time here.\n\n\n Call Recording:\n \nLink to the incident call recording / slack transcript or DoIT card.\n\n\nOverview\n#\n\n\nInclude a \nshort\n sentence or two summarizing the root cause, timeline summary, and the impact. E.g. \"On the morning of August 99th, we suffered a 1 minute IN-3  due to a runaway process on our primary database machine. This slowness caused roughly 0.024% of alerts that had begun during this time to be delivered out of SLA.\"\n\n\nWhat Happened\n#\n\n\nInclude a short description of what happened.\n\n\nRoot Cause\n#\n\n\nInclude a description of the root cause. If there were any actions taken that exacerbated the issue, also include them here with the intention of learning from any mistakes made during the resolution process.\n\n\nResolution\n#\n\n\nInclude a description what solved the problem. If there was a temporary fix in place, describe that along with the long-term solution.\n\n\nImpact\n#\n\n\nBe very specific here, include exact numbers.\n\n\n\n\n\n\n\n\nTime in SR-3\n\n\n?mins\n\n\n\n\n\n\n\n\n\n\nNotifications Delivered out of SLA\n\n\n??% (?? of ??)\n\n\n\n\n\n\nEvents Dropped / Not Accepted\n\n\n??% (?? of ??) 
\nShould usually be 0, but always check\n\n\n\n\n\n\nAccounts Affected\n\n\n??\n\n\n\n\n\n\nUsers Affected\n\n\n??\n\n\n\n\n\n\nSupport Requests Raised\n\n\n?? \nInclude any relevant links to tickets\n\n\n\n\n\n\n\n\nResponders\n#\n\n\n\n\nWho was the TL?\n\n\nWho was the scribe?\n\n\nWho else was involved?\n\n\nWho else was involved?\n\n\n\n\nTimeline\n#\n\n\nSome important times to include: (1) time the root cause began, (2) time of the page, (3) time that the status page was updated (i.e. when the incident became public), (4) time of any significant actions, (5) time the IN-3 ended, (6) links to tools/logs that show how the timestamp was arrived at.\n\n\n\n\n\n\n\n\nTime (UTC)\n\n\nEvent\n\n\nData Link\n\n\n\n\n\n\n\n\n\n\nHow'd We Do?\n#\n\n\nWhat Went Well?\n#\n\n\n\n\nList anything you did well and want to call out. It's OK to not list anything.\n\n\n\n\nWhat Didn't Go So Well?\n#\n\n\n\n\nList anything you think we didn't do very well. The intent is that we should follow up on all points here to improve our processes.\n\n\n\n\nAction Items\n#\n\n\nEach action item should be in the form of a DoIT card respectiv GTD next actions principle:  \"a clear and concise single action to move things forward\u201d. Include action items such as: (1) any fixes required to prevent the root cause in the future, (2) any preparedness tasks that could help mitigate the problem if it came up again, (3) remaining post-mortem steps, such as the internal email, as well as the status-page public post, (4) any improvements to our incident response process.\n\n\nMessaging\n#\n\n\nInternal Email\n#\n\n\nThis is a follow-up for employees. It should be sent out right after the post-mortem meeting is over. 
It only needs a short paragraph summarizing the incident and a link to this wiki page.\n\n\n\n\nBriefly summarize what happened and where the post-mortem page (this page) can be found.\n\n\n\n\nExternal Message\n#\n\n\nThis is what will be included on the public facing status website (status.spearhead.systems) regarding this incident. What are we telling customers, including an apology? (The apology should be genuine, not rote.)\n\n\n\n\nSummary\n\n\nWhat Happened?\n\n\nWhat Are We Doing About This?", 
+            "text": "This is a standard template for post-mortems. Each section describes the type of information you will want to put in that section.\n\n\n\n\n\n\nGuidelines\n\n\nThis page is intended to be reviewed during a post-mortem meeting that should be scheduled within 5 business days of any event.\nYour first step should be to schedule the post-mortem meeting in the shared calendar for within 5 business days after the incident.\nDon't wait until you've filled in the info to schedule the meeting, however make sure the page is completed by the meeting.\n\n\n\n\n Post-Mortem Owner:\n \nYour name goes here.\n\n\n Meeting Scheduled For:\n \nSchedule the meeting on the \"Incident Post-Mortem Meetings\" shared calendar, for within 5 business days after the incident. Put the date/time here.\n\n\n Call Recording:\n \nLink to the incident call recording / slack transcript or DoIT card.\n\n\nOverview\n#\n\n\nInclude a \nshort\n sentence or two summarizing the root cause, timeline summary, and the impact. E.g. \"On the morning of August 99th, we suffered a 1 minute IN-3  due to a runaway process on our primary database machine. This slowness caused roughly 0.024% of alerts that had begun during this time to be delivered out of SLA.\"\n\n\nWhat Happened\n#\n\n\nInclude a short description of what happened.\n\n\nRoot Cause\n#\n\n\nInclude a description of the root cause. If there were any actions taken that exacerbated the issue, also include them here with the intention of learning from any mistakes made during the resolution process.\n\n\nResolution\n#\n\n\nInclude a description what solved the problem. If there was a temporary fix in place, describe that along with the long-term solution.\n\n\nImpact\n#\n\n\nBe very specific here, include exact numbers.\n\n\n| Time in SR-3 | ?mins |\n| Time in IN-3 | ?mins |\n| Notifications Delivered out of SLA | ??% (?? of ??) |\n| Events Dropped / Not Accepted | ??% (?? of ??) 
\nShould usually be 0, but always check\n |\n| Accounts Affected | ?? |\n| Users Affected | ?? |\n| Support Requests Raised | ?? \nInclude any relevant links to tickets\n |\n\n\nResponders\n#\n\n\n\n\nWho was the TL?\n\n\nWho was the scribe?\n\n\nWho else was involved?\n\n\nWho else was involved?\n\n\n\n\nTimeline\n#\n\n\nSome important times to include: (1) time the root cause began, (2) time of the page, (3) time that the status page was updated (i.e. when the incident became public), (4) time of any significant actions, (5) time the IN-3 ended, (6) links to tools/logs that show how the timestamp was arrived at.\n\n\n\n\n\n\n\n\nTime (UTC)\n\n\nEvent\n\n\nData Link\n\n\n\n\n\n\n\n\n\n\nHow'd We Do?\n#\n\n\nWhat Went Well?\n#\n\n\n\n\nList anything you did well and want to call out. It's OK to not list anything.\n\n\n\n\nWhat Didn't Go So Well?\n#\n\n\n\n\nList anything you think we didn't do very well. The intent is that we should follow up on all points here to improve our processes.\n\n\n\n\nAction Items\n#\n\n\nEach action item should be in the form of a DoIT card respectiv GTD next actions principle:  \"a clear and concise single action to move things forward\u201d. Include action items such as: (1) any fixes required to prevent the root cause in the future, (2) any preparedness tasks that could help mitigate the problem if it came up again, (3) remaining post-mortem steps, such as the internal email, as well as the status-page public post, (4) any improvements to our incident response process.\n\n\nMessaging\n#\n\n\nInternal Email\n#\n\n\nThis is a follow-up for employees. It should be sent out right after the post-mortem meeting is over. 
It only needs a short paragraph summarizing the incident and a link to this wiki page.\n\n\n\n\nBriefly summarize what happened and where the post-mortem page (this page) can be found.\n\n\n\n\nExternal Message\n#\n\n\nThis is what will be included on the public facing status website (status.spearhead.systems) regarding this incident. What are we telling customers, including an apology? (The apology should be genuine, not rote.)\n\n\n\n\nSummary\n\n\nWhat Happened?\n\n\nWhat Are We Doing About This?", 
             "title": "Post-Mortem Template"
         }, 
         {
@@ -467,7 +462,7 @@
         }, 
         {
             "location": "/after/post_mortem_template/#impact", 
-            "text": "Be very specific here, include exact numbers.     Time in SR-3  ?mins      Notifications Delivered out of SLA  ??% (?? of ??)    Events Dropped / Not Accepted  ??% (?? of ??)  Should usually be 0, but always check    Accounts Affected  ??    Users Affected  ??    Support Requests Raised  ??  Include any relevant links to tickets", 
+            "text": "Be very specific here, include exact numbers.  | Time in SR-3 | ?mins |\n| Time in IN-3 | ?mins |\n| Notifications Delivered out of SLA | ??% (?? of ??) |\n| Events Dropped / Not Accepted | ??% (?? of ??)  Should usually be 0, but always check  |\n| Accounts Affected | ?? |\n| Users Affected | ?? |\n| Support Requests Raised | ??  Include any relevant links to tickets  |", 
             "title": "Impact"
         }, 
         {
@@ -802,7 +797,7 @@
         }, 
         {
             "location": "/about/", 
-            "text": "This site documents parts of the Spearhead Systems Issue Response process. It is a cut-down version of our internal documentation, used at Spearhead Systems for any incident or service request, and to prepare new employees for on-call responsibilities. It provides information not only on preparation but also what to do during and after.\n\n\nThis documentation is complementary to what is available in our \nexisting wiki\n.\n\n\nWhat is this?\n#\n\n\nA collection of pages detailing how to efficiently deal with any incident or service request that might arise, along with information on how to go on-call effectively. It provides lessons learned the hard way, along with training material for getting you up to speed quickly.\n\n\nWho is this for?\n#\n\n\nIt is intended for on-call practitioners and those involved in an operational incident or service request response process, or those wishing to enact a formal incident response process. Specifically this is for all of our Technical Support staff.\n\n\nWhy do I need it?\n#\n\n\nAs a service provider Spearhead Systems deals with service requests on a daily basis. The reason we exist is to deliver a service which in most cases boils down to incidents and service requests. We want to deliver a smooth and seamless experience for resolving our customers issues therefore this documentation is a guideline for how we handle these requests. This documentation will allow you give you a head start on how to deal with issues in a way which leads to the fastest possible recovery time.\n\n\nWhat is covered?\n#\n\n\nAnything from preparing to \ngo on-call\n, definitions of \nseverities\n, incident \ncall etiquette\n, all the way to how to run a \npost-mortem\n, providing a \npost-mortem template\n and even a \nsecurity incident response process\n.\n\n\nWhat is missing?\n#\n\n\nLots, dig in an help us complete the picture. 
We can migrate most processes from Sharepoint here.\n\n\nLicense\n#\n\n\nThis documentation is provided under the Apache License 2.0. In plain English that means you can use and modify this documentation and use it both commercially and for private use. However, you must include any original copyright notices, and the original LICENSE file.\n\n\nWhether you are a Spearhead Systems customer or not, we want you to have the ability to use this documentation internally at your own company. You can view the source code for all of this documentation on our GitHub account, feel free to fork the repository and use it as a base for your own internal documentation.", 
+            "text": "This site documents parts of the Spearhead Systems technical support response process. It is a cut-down version of our internal documentation, used at Spearhead Systems for any incident or service request, and to prepare new employees for on-call responsibilities. It provides information not only on preparation but also what to do during and after.\n\n\nThis documentation is complementary to what is available in our \nexisting wiki\n.\n\n\nWhat is this?\n#\n\n\nA collection of pages detailing how to efficiently deal with any incident or service request that might arise, along with information on how to go on-call effectively. It provides lessons learned the hard way, along with training material for getting you up to speed quickly.\n\n\nWho is this for?\n#\n\n\nIt is intended for our technical support staff and customers/partners looking for more details regarding our support process. \n\n\nWhy do I need it?\n#\n\n\nAs a service provider Spearhead Systems deals with technical support requests on a daily basis. The reason we exist is to deliver our technical support services which boils down to responding to incidents and service requests. We want to deliver a smooth and seamless experience for resolving our customers' issues therefore this documentation is a guideline for how we handle these requests. This documentation will give you a head start on how to deal with issues in a way which leads to the fastest possible recovery time.\n\n\nWhat is covered?\n#\n\n\nAnything from preparing to \ngo on-call\n, definitions of \nseverities\n, incident \ncall etiquette\n, all the way to how to run a \npost-mortem\n, providing a \npost-mortem template\n and even a \nsecurity incident response process\n.\n\n\nWhat is missing?\n#\n\n\nLots, dig in and help us complete the picture. We can migrate most processes from Sharepoint here. 
We're also looking for experienced operations/support people who are willing to share their experience with us and help us provide a better support service.\n\n\nLicense\n#\n\n\nThis documentation is provided under the Apache License 2.0. In plain English that means you can use and modify this documentation and use it both commercially and for private use. However, you must include any original copyright notices, and the original LICENSE file.\n\n\nWhether you are a Spearhead Systems customer or not, we want you to have the ability to use this documentation internally at your own company. You can view the source code for all of this documentation on our GitHub account, feel free to fork the repository and use it as a base for your own internal documentation.\n\n\nPlease also check-out \nPagerDuty's\n response documentation which has made our own efforts in documenting our process much easier.", 
             "title": "About"
         }, 
         {
@@ -812,12 +807,12 @@
         }, 
         {
             "location": "/about/#who-is-this-for", 
-            "text": "It is intended for on-call practitioners and those involved in an operational incident or service request response process, or those wishing to enact a formal incident response process. Specifically this is for all of our Technical Support staff.", 
+            "text": "It is intended for our technical support staff and customers/partners looking for more details regarding our support process.", 
             "title": "Who is this for?"
         }, 
         {
             "location": "/about/#why-do-i-need-it", 
-            "text": "As a service provider Spearhead Systems deals with service requests on a daily basis. The reason we exist is to deliver a service which in most cases boils down to incidents and service requests. We want to deliver a smooth and seamless experience for resolving our customers issues therefore this documentation is a guideline for how we handle these requests. This documentation will allow you give you a head start on how to deal with issues in a way which leads to the fastest possible recovery time.", 
+            "text": "As a service provider Spearhead Systems deals with technical support requests on a daily basis. The reason we exist is to deliver our technical support services which boils down to responding to incidents and service requests. We want to deliver a smooth and seamless experience for resolving our customers' issues therefore this documentation is a guideline for how we handle these requests. This documentation will give you a head start on how to deal with issues in a way which leads to the fastest possible recovery time.", 
             "title": "Why do I need it?"
         }, 
         {
@@ -827,12 +822,12 @@
         }, 
         {
             "location": "/about/#what-is-missing", 
-            "text": "Lots, dig in an help us complete the picture. We can migrate most processes from Sharepoint here.", 
+            "text": "Lots, dig in and help us complete the picture. We can migrate most processes from Sharepoint here. We're also looking for experienced operations/support people who are willing to share their experience with us and help us provide a better support service.", 
             "title": "What is missing?"
         }, 
         {
             "location": "/about/#license", 
-            "text": "This documentation is provided under the Apache License 2.0. In plain English that means you can use and modify this documentation and use it both commercially and for private use. However, you must include any original copyright notices, and the original LICENSE file.  Whether you are a Spearhead Systems customer or not, we want you to have the ability to use this documentation internally at your own company. You can view the source code for all of this documentation on our GitHub account, feel free to fork the repository and use it as a base for your own internal documentation.", 
+            "text": "This documentation is provided under the Apache License 2.0. In plain English that means you can use and modify this documentation and use it both commercially and for private use. However, you must include any original copyright notices, and the original LICENSE file.  Whether you are a Spearhead Systems customer or not, we want you to have the ability to use this documentation internally at your own company. You can view the source code for all of this documentation on our GitHub account, feel free to fork the repository and use it as a base for your own internal documentation.  Please also check-out  PagerDuty's  response documentation which has made our own efforts in documenting our process much easier.", 
             "title": "License"
         }
     ]
diff --git a/oncall/alerting_principles/index.html b/oncall/alerting_principles/index.html
index bb765f6..78e608a 100644
--- a/oncall/alerting_principles/index.html
+++ b/oncall/alerting_principles/index.html
@@ -445,7 +445,7 @@
           <p>We manage how we get alerted based on many factors such as the customers contractual SLA, the urgency of their request or incident, etc.. <strong>an alert or notification is something which requires a human to perform an action</strong>. Based on the severity of the issue (service request or incident) we prioritize accordingly in <a href="http://doit.sphs.ro">DoIT</a>.</p>
 <div class="admonition warning">
 <p class="admonition-title">Major Priority Alerts</p>
-<p>Anything that wakes up a human in the middle of the night should be <strong>immediately human actionable</strong>. If it is none of those things, then we need to adjust the alert to not page at those times.</p>
+<p>Anything that wakes up a human in the middle of the night should be <strong>immediately human actionable</strong>. If it is none of those things, then we need to adjust the alert to not bother us at those times.</p>
 </div>
 <table>
 <thead>
@@ -478,12 +478,12 @@
 </tr>
 </tbody>
 </table>
-<p>Both IN and SR (incidents, service requests) share the same priorities. The actual response / resolution times vary and are based upon contractual agreements with the customer. These details (SLA) are available in DoIT on the organization page of the respective customer.</p>
+<p>Both IN and SR (incidents, service requests) share the same priorities. The actual response / resolution times vary and are based upon contractual agreements with the customer. These details (SLA) are available in DoIT on the organization page.</p>
 <p>If you're setting up a new alert/notification, consider the chart above for how you want to alert people. Be mindful of not creating new high-priority alerts if they don't require an immediate response, for example.</p>
 <div class="admonition info">
 <p class="admonition-title">Alert Channels</p>
-<p>Presently we use email as the only notification method. This means keeping an eye on your email is essential!
-SMS and Push notifications are in the pipeline for DoIT.  </p>
+<p>Primarily we use email as the notification/alert method and all of our customers are encouraged to use this method. Secondly there is the DoIT customer portal which will send alerts to the on-call person(s) and escalate based on SLA/contractual agreements. Thirdly we use our centralized support telephone number and individual phones. This means keeping an eye on your email is essential!</p>
+<p>SMS and Push notifications are in the pipeline for DoIT.  </p>
 </div>
 <h2 id="examples">Examples<a class="headerlink" href="#examples" title="Permanent link">#</a></h2>
 <h4 id="production-service-is-failing-for-75-of-requests-automation-is-unable-to-resolve_">"Production service is failing for 75% of requests, automation is unable to resolve."_<a class="headerlink" href="#production-service-is-failing-for-75-of-requests-automation-is-unable-to-resolve_" title="Permanent link">#</a></h4>
diff --git a/oncall/being_oncall/index.html b/oncall/being_oncall/index.html
index 2b2f83f..8c8f77f 100644
--- a/oncall/being_oncall/index.html
+++ b/oncall/being_oncall/index.html
@@ -469,24 +469,32 @@
           <p>A summary of expectations and helpful information for being on-call.</p>
 <p><img alt="Alert Fatigue" src="../../assets/img/misc/alert_fatigue.png" /></p>
 <h2 id="what-is-on-call">What is On-Call?<a class="headerlink" href="#what-is-on-call" title="Permanent link">#</a></h2>
-<p>At Spearhead being on-call means that you are able to be contacted at any time in order to investigate and fix issues that may arise. There are two on-call scenarios that you will deal with:</p>
+<p>At Spearhead, being on-call means that you are responsible for monitoring our communications channels and responding to requests at any time. There are two on-call scenarios that you will deal with:</p>
 <ul>
 <li>during your normal work shift</li>
-<li>being on-call for outside working hours</li>
+<li>outside working hours</li>
 </ul>
-<p>For example, if you are on-call outside normal working hours, should any alarms be triggered by our monitoring solution, you will receive a "page" (an alert on your mobile device, email, phone call, or SMS, etc.) giving you details on what has broken. You will be expected to take whatever actions are necessary in order to resolve the issue and return your service to a normal state. </p>
-<p>At Spearhead Systems we consider you are on-call during normal working hours in which case you are proactively working with <a href="http://doit.sphs.ro/">DoIT</a> and looking over your assigned cards/boards as well as when you are formally "on-call" and issues are being redirected to you.</p>
-<p>On-call responsibilities extend beyond normal office hours, and if you are on-call you are expected to be able to respond to issues, even at 2am. This sounds horrible (and it can be), but this is what our customers go through, and is the problem that the Spearhead Systems professional services is trying to fix!</p>
+<p>For example, if you are on-call outside normal working hours, should any alarms be triggered by our monitoring solution or a customer emails our support channel, you will receive a "notification" (an alert on your mobile device, email, phone call, or SMS, etc.) giving you details on what has broken. 
+You will be expected to gather as much information as possible, create the required cards in our ticketing systems, delegate or assign the card to the right person/watchers and otherwise take whatever actions are necessary in order to resolve the issue. </p>
+<!-- At Spearhead Systems we consider you are on-call during normal working hours in which case you are proactively working with [DoIT](http://doit.sphs.ro/) and looking over your assigned cards/boards as well as when you are formally "on-call" and issues are being redirected to you. -->
+
+<p>On-call responsibilities extend beyond normal office hours, and if you are on-call you are expected to be able to respond to issues, even at 2am. This sounds horrible (and it can be), but this is what our customers go through, and is the problem that the Spearhead Systems technical support services is trying to fix!</p>
+<p>When you are on-call during normal working hours you are the central contact for our entire support team. We expect you will delegate and assign the card to your colleagues and only attempt to resolve issues if your current workload permits. 
+When you are on-call outside working hours you are expected to handle as much of the process as possible and delegate only if it is outside your area of expertise or you encounter problems that require another colleague's input.</p>
+<div class="admonition note">
+<p class="admonition-title">When in the office</p>
+<p>You are, generally speaking, on-call during your normal working hours even if you are not <em>the</em> on-call engineer. This means you are keeping an eye on the cards assigned to you directly or that you are a watcher for. If you are ever in a position where you have no assigned cards and it is not clear what to work on, ask a TL or senior Sysadmin to help point you in the right direction.</p>
+</div>
 <h2 id="responsibilities">Responsibilities<a class="headerlink" href="#responsibilities" title="Permanent link">#</a></h2>
 <ol>
 <li>
 <p><strong>Prepare</strong></p>
 <ul>
-<li>Have your laptop and Internet with you (office, home, a MiFi dongle, a phone with a tethering plan, etc).<ul>
-<li>Have a way to charge your MiFi.</li>
+<li>Have your laptop and Internet with you (office, home, a phone with a tethering plan, etc).<ul>
+<li>Have a way to charge your phone.</li>
 </ul>
 </li>
-<li>Team alert escalation happens within 5 minutes, set/stagger your notification timeouts (push, SMS, phone...) accordingly.<ul>
+<li>Team alert escalation happens within 30 minutes, set/stagger your notification timeouts (push, SMS, phone...) accordingly.<ul>
 <li>Make sure Spearhead Systems (and colleagues directly) texts and calls can bypass your "Do Not Disturb" settings.</li>
 </ul>
 </li>
@@ -501,10 +509,10 @@
 <li>Acknowledge and act on alerts whenever you can (see the first "Not responsibilities" point below)</li>
 <li>Determine the urgency of the problem:<ul>
 <li>Is it something that should be worked on right now or escalated into a major incident? ("production server on fire" situations. Security alerts) - do so.</li>
-<li>Is it some tactical work that doesn't have to happen during the night? (for example, disk utilization high watermark, but there's plenty of space left and the trend is not indicating impending doom) - snooze the alert until a more suitable time (working hours, the next morning...) and get back to fixing it then.</li>
+<li>Is it some tactical work that doesn't have to happen during the night? (for example, disk utilization high watermark, but there's plenty of space left and the trend is not indicating impending doom) - snooze the issue until a more suitable time (working hours, the next morning...) and get back to fixing it then.</li>
 </ul>
 </li>
-<li>Check Slack for current activity. Often (but not always) actions that could potentially cause alerts will be announced there.</li>
+<li>Check our <em>internal Chat</em> for current activity. Often (but not always) actions that could potentially cause alerts will be announced there.</li>
 <li>Does the alert and your initial investigation indicate a general problem or an issue with a specific service that the relevant team should look into? If it does not look like a problem you are the expert for, then escalate to another team member or group.</li>
 </ul>
 </li>
@@ -513,14 +521,15 @@
 <ul>
 <li>You are empowered to dive into any problem and act to fix it.</li>
 <li>Involve other team members as necessary: do not hesitate to escalate if you cannot figure out the cause within a reasonable timeframe or if the service / alert is something you have not tackled before.</li>
-<li>If the issue is not very time sensitive and you have other priority work, make a note of this in DoIT to keep a track of it (with an appropriate severity and due date).</li>
+<li>If the issue is not very time sensitive and you have other priority work, make a note of this in DoIT to keep a track of it (with an appropriate severity, comment and due date).</li>
 </ul>
 </li>
 <li>
 <p><strong>Improve</strong></p>
 <ul>
 <li>If a particular issue keeps happening; if an issue alerts often but turns out to be a preventable non-issue – perhaps improving this should be a longer-term task.<ul>
-<li>Disks that fill up, logs that should be rotated, noisy alerts...(we use ansible, go ahead and start automating!)</li>
+<li>Disks that fill up, logs that should be rotated, noisy alerts...(we use ansible and rundeck, go ahead and start automating!)</li>
+<li>When we perform a DoD (definition of done) this is a good time to bring up recurring issues for discussion.</li>
 </ul>
 </li>
 <li>If information is difficult / impossible to find, write it down. Constantly refactor and improve our knowledge base and documentation. Add redundant links and pointers if your mental model of the wiki / codebase does not match the way it is currently organized.</li>
@@ -530,12 +539,12 @@
 <p><strong>Support</strong></p>
 <ul>
 <li>When your on-call "shift" ends, let the next on-call and team know about issues that have not been resolved yet and other experiences of note.<ul>
-<li>Make an effort to cleanly handover necessary information. We use Slack, email and DoIT to communicate. </li>
-<li>This is a best-practice that should be applied whenever there are details that by sharong would benefit the efficiency of the team.</li>
+<li>Make an effort to cleanly handover necessary information. We use <em>internal Chat</em>, email and DoIT to communicate. </li>
+<li>This is a best-practice that should be applied whenever there are details that by sharing would benefit the efficiency of the team.</li>
 </ul>
 </li>
 <li>If you are making a change that impacts the schedule (adding / removing yourself, for example), let others know since many of us make arrangements around the on-call schedule well in advance.</li>
-<li>Support each other: when doing activities that might generate plenty of pages, it is courteous to "take the page" away from the on-call by notifying them and scheduling an override for the duration.</li>
+<li>Support each other: when doing activities that might generate plenty of alerts, it is courteous to "place the service/host in maintenance" and take it away from the on-call by notifying them and scheduling an override for the duration.</li>
 </ul>
 </li>
 </ol>
@@ -567,41 +576,38 @@
 <p><img alt="Escalation" src="../../assets/img/misc/escalation.png" /></p>
 <ul>
 <li>
-<p>Team leaders (TL) can (and should) be part of your normal rotation. It gives a better insight into what has been going on.</p>
+<p>Team leaders (TL) are a part of our normal rotation. It gives a better insight into what has been going on.</p>
 </li>
 <li>
-<p>New members of the team should shadow your on-call rotation during the first few weeks. They should get all alerts, and should follow along with what you are doing. (All new employees shadow the Support team for one week of on-call, but it's useful to have new team members shadow your team rotations also. Just not at the same time).</p>
+<p>New members of the team should shadow your on-call rotation during the first few weeks. They should get all alerts, and should follow along with what you are doing. (All new employees shadow the Support team for one week of on-call, but it's useful to have new team members shadow your team rotations also.)</p>
 </li>
-<li>
-<p>Our escalation timeout is set to 5 minutes. This is usually plenty of time for someone to acknowledge the incident if they're able to. If they're not able to within 5 minutes, then they're probably not in a good position to respond to the incident anyway.</p>
-<ul>
-<li>Triggering an escalation is done automatically in most situations based on the type, priority and severity of the issue.</li>
 </ul>
-</li>
-<li>
-<p>When going off-call, you should provide a quick summary to the next on-call about any issues that may come up during their shift. A service has been flapping, an issue is likely to re-occur, etc. If you want to be formal, this can be a written report via email, but generally a verbal summary is sufficient.</p>
-</li>
+<!-- // we do not yet implement escalation for incidents, not automatically // * Our escalation timeout is set to 5 minutes. This is usually plenty of time for someone to acknowledge the incident if they're able to. If they're not able to within 5 minutes, then they're probably not in a good position to respond to the incident anyway.
+    * Triggering an escalation is done automatically in most situations based on the type, priority and severity of the issue.
+    * Escalations only happen to incidents! Service Requests must be manually escalated based on customer input -->
+
+<ul>
+<li>When going off-call, you should provide a quick summary to the next on-call about any issues that may come up during their shift. A service has been flapping, an issue is likely to re-occur, etc. If you want to be formal, this can be a written report via email, but generally a verbal summary during our morning stand-up is sufficient.</li>
 </ul>
 <h3 id="notification-method-recommendations">Notification Method Recommendations<a class="headerlink" href="#notification-method-recommendations" title="Permanent link">#</a></h3>
 <p>You are free to set up your notification rules as you see fit, to match how you would like to best respond to incidents. If you're not sure how to configure them, the Support team has some recommendations,</p>
 <p><img alt="Mobile Alerts" src="../../assets/img/misc/mobile_alerts.png" /></p>
-<ul>
-<li>Use Push Notification and Email as your first method of notification. Most of us have phones with us at all times, so this is a prudent first method and is usually sufficient. (DoIT is in the process of integratoin with SNS for push notifications)</li>
-<li>Use Phone and/or SMS notification each minute after, until the escalation time. If Push didn't work, then it's likely you need something stronger, like a phone call. Keep calling every minute until it's too late. If you don't pick up by the 3rd time, then it's unlikely you are able to respond, and the incident will get escalated away from you.</li>
-</ul>
+<!-- // still working on integration for SMS // * Use Push Notification and Email as your first method of notification. Most of us have phones with us at all times, so this is a prudent first method and is usually sufficient. (DoIT is in the process of integration with SNS for push notifications)
+* Use Phone and/or SMS notification each minute after, until the escalation time. If Push didn't work, then it's likely you need something stronger, like a phone call. Keep calling every minute until it's too late. If you don't pick up by the 3rd time, then it's unlikely you are able to respond, and the incident will get escalated away from you. -->
+
 <h2 id="etiquette">Etiquette<a class="headerlink" href="#etiquette" title="Permanent link">#</a></h2>
 <ul>
 <li>
 <p>If the current on-call comes into the office at 12pm looking tired, it's not because they're lazy. They probably got paged in the night. Cut them some slack and be nice.</p>
 </li>
 <li>
-<p>Don't acknowledge an incident out from under someone else. If you didn't get paged for the incident, then you shouldn't be acknowledging it. Add a comment with your notes instead.</p>
+<p>Don't close or otherwise modify a card out from under someone else. If you didn't get that specific card assigned to you as owner or a watcher, then you shouldn't be modifying it. Add a comment with your notes instead in the monitoring system and in DoIT.</p>
 </li>
 </ul>
 <p><img alt="Acknowledging" src="../../assets/img/misc/ack.png" /></p>
 <ul>
 <li>
-<p>If you are testing something, or performing an action that you know will cause a page (notification, alert), it's customary to "take the pager" for the time during which you will be testing. Notify the person on-call that you are taking the pager for the next hour while you test.</p>
+<p>If you are testing something, or performing an action that you know will cause an alert from our monitoring or may be identified as an issue by our customers, it's customary to "place the host/service in downtime" and inform all the involved parties, for the time during which you will be testing. Notify the person on-call so they are aware of your testing.</p>
 </li>
 <li>
 <p>"Never hesitate to escalate" - Never feel ashamed to rope in someone else if you're not sure how to resolve an issue. Likewise, never look down on someone else if they ask you for help.</p>
@@ -610,7 +616,7 @@
 <p>Always consider covering an hour or so of someone else's on-call time if they request it and you are able. We all have lives which might get in the way of on-call time, and one day it might be you who needs to swap their on-call time in order to have a night out with your friend from out of town.</p>
 </li>
 <li>
-<p>If an issue comes up during your on-call shift for which you got paged, you are responsible for resolving it. Even if it takes 3 hours and there's only 1 hour left of your shift. You can hand over to the next on-call if they agree, but you should never assume that's possible.</p>
+<p>If an issue comes up during your on-call shift for which you got called, you are responsible for resolving it. Even if it takes 3 hours and there's only 1 hour left of your shift. You can hand over to the next on-call if they agree, but you should never assume that's possible.</p>
 </li>
 </ul>
           <aside class="copyright" role="note">
diff --git a/sitemap.xml b/sitemap.xml
index fdabb59..b0b01bb 100644
--- a/sitemap.xml
+++ b/sitemap.xml
@@ -4,7 +4,7 @@
     
     <url>
      <loc>https://response.spearhead.systems/</loc>
-     <lastmod>2017-01-21</lastmod>
+     <lastmod>2017-08-13</lastmod>
      <changefreq>daily</changefreq>
     </url>
     
@@ -13,13 +13,13 @@
         
     <url>
      <loc>https://response.spearhead.systems/oncall/being_oncall/</loc>
-     <lastmod>2017-01-21</lastmod>
+     <lastmod>2017-08-13</lastmod>
      <changefreq>daily</changefreq>
     </url>
         
     <url>
      <loc>https://response.spearhead.systems/oncall/alerting_principles/</loc>
-     <lastmod>2017-01-21</lastmod>
+     <lastmod>2017-08-13</lastmod>
      <changefreq>daily</changefreq>
     </url>
         
@@ -29,19 +29,19 @@
         
     <url>
      <loc>https://response.spearhead.systems/before/severity_levels/</loc>
-     <lastmod>2017-01-21</lastmod>
+     <lastmod>2017-08-13</lastmod>
      <changefreq>daily</changefreq>
     </url>
         
     <url>
      <loc>https://response.spearhead.systems/before/different_roles/</loc>
-     <lastmod>2017-01-21</lastmod>
+     <lastmod>2017-08-13</lastmod>
      <changefreq>daily</changefreq>
     </url>
         
     <url>
      <loc>https://response.spearhead.systems/before/call_etiquette/</loc>
-     <lastmod>2017-01-21</lastmod>
+     <lastmod>2017-08-13</lastmod>
      <changefreq>daily</changefreq>
     </url>
         
@@ -51,13 +51,13 @@
         
     <url>
      <loc>https://response.spearhead.systems/during/during_an_incident/</loc>
-     <lastmod>2017-01-21</lastmod>
+     <lastmod>2017-08-13</lastmod>
      <changefreq>daily</changefreq>
     </url>
         
     <url>
      <loc>https://response.spearhead.systems/during/security_incident_response/</loc>
-     <lastmod>2017-01-21</lastmod>
+     <lastmod>2017-08-13</lastmod>
      <changefreq>daily</changefreq>
     </url>
         
@@ -67,13 +67,13 @@
         
     <url>
      <loc>https://response.spearhead.systems/after/post_mortem_process/</loc>
-     <lastmod>2017-01-21</lastmod>
+     <lastmod>2017-08-13</lastmod>
      <changefreq>daily</changefreq>
     </url>
         
     <url>
      <loc>https://response.spearhead.systems/after/post_mortem_template/</loc>
-     <lastmod>2017-01-21</lastmod>
+     <lastmod>2017-08-13</lastmod>
      <changefreq>daily</changefreq>
     </url>
         
@@ -83,37 +83,37 @@
         
     <url>
      <loc>https://response.spearhead.systems/training/overview/</loc>
-     <lastmod>2017-01-21</lastmod>
+     <lastmod>2017-08-13</lastmod>
      <changefreq>daily</changefreq>
     </url>
         
     <url>
      <loc>https://response.spearhead.systems/training/team_leader/</loc>
-     <lastmod>2017-01-21</lastmod>
+     <lastmod>2017-08-13</lastmod>
      <changefreq>daily</changefreq>
     </url>
         
     <url>
      <loc>https://response.spearhead.systems/training/sysadmin/</loc>
-     <lastmod>2017-01-21</lastmod>
+     <lastmod>2017-08-13</lastmod>
      <changefreq>daily</changefreq>
     </url>
         
     <url>
      <loc>https://response.spearhead.systems/training/scribe/</loc>
-     <lastmod>2017-01-21</lastmod>
+     <lastmod>2017-08-13</lastmod>
      <changefreq>daily</changefreq>
     </url>
         
     <url>
      <loc>https://response.spearhead.systems/training/subject_matter_expert/</loc>
-     <lastmod>2017-01-21</lastmod>
+     <lastmod>2017-08-13</lastmod>
      <changefreq>daily</changefreq>
     </url>
         
     <url>
      <loc>https://response.spearhead.systems/training/glossary/</loc>
-     <lastmod>2017-01-21</lastmod>
+     <lastmod>2017-08-13</lastmod>
      <changefreq>daily</changefreq>
     </url>
         
@@ -122,7 +122,7 @@
     
     <url>
      <loc>https://response.spearhead.systems/about/</loc>
-     <lastmod>2017-01-21</lastmod>
+     <lastmod>2017-08-13</lastmod>
      <changefreq>daily</changefreq>
     </url>
     

Time in SR-3	?mins
Notifications Delivered out of SLA	??% (?? of ??)
Events Dropped / Not Accepted	??% (?? of ??) Should usually be 0, but always check
Accounts Affected	??
Users Affected	??
Support Requests Raised	?? Include any relevant links to tickets
#support	#support (on MS Teams/internal Chat)	http://response.spearhead.systems	+40728 005 263
Need a TL? Do `!tl page` in Slack			Need a TL? Use a Sysadmin!
For executive summary updates only, join #executive-summary-updates.