2021-06-15
§
|
16:55 |
<razzi> |
sudo -i wmf-auto-reimage-host -p T278423 an-master1002.eqiad.wmnet |
[analytics] |
16:53 |
<razzi> |
run uid script on an-master1002 |
[analytics] |
16:33 |
<elukey> |
restart hadoop-yarn-resourcemanager on an-master1001 |
[analytics] |
16:16 |
<razzi> |
sudo systemctl stop 'hadoop-*' on an-master1002 |
[analytics] |
16:14 |
<razzi> |
sudo systemctl stop hadoop-* on an-master1001, then realized I meant to do this on an-master1002, so started hadoop-* again |
[analytics] |
16:11 |
<razzi> |
downtime an-master1002 |
[analytics] |
15:55 |
<razzi> |
sudo transfer.py an-master1001.eqiad.wmnet:/srv/hadoop/backup/hdfs-namenode-snapshot-buster-reimage-2021-06-15.tar.gz stat1004.eqiad.wmnet:/home/razzi/hdfs-namenode-fsimage |
[analytics] |
15:42 |
<razzi> |
tar -czf /srv/hadoop/backup/hdfs-namenode-snapshot-buster-reimage-$(date --iso-8601).tar.gz current on an-master1001 |
[analytics] |
15:38 |
<razzi> |
backup /srv/hadoop/name/current to /home/razzi/hdfs-namenode-snapshot-buster-reimage-2021-06-15.tar.gz on an-master1001 |
[analytics] |
15:33 |
<razzi> |
sudo -u hdfs kerberos-run-command hdfs hdfs dfsadmin -saveNamespace |
[analytics] |
15:27 |
<razzi> |
sudo -u hdfs kerberos-run-command hdfs hdfs dfsadmin -safemode enter |
[analytics] |
15:25 |
<razzi> |
kill running yarn applications via for loop |
[analytics] |
15:11 |
<razzi> |
sudo -u yarn kerberos-run-command yarn yarn rmadmin -refreshQueues |
[analytics] |
15:09 |
<razzi> |
disable puppet on an-masters |
[analytics] |
15:08 |
<razzi> |
run puppet on an-masters to update capacity-scheduler.xml |
[analytics] |
15:02 |
<razzi> |
disable puppet on an-masters |
[analytics] |
15:01 |
<razzi> |
sudo -u yarn kerberos-run-command yarn yarn rmadmin -refreshQueues to stop queues |
[analytics] |
14:35 |
<razzi> |
disable jobs that use hadoop on an-launcher1002 following https://phabricator.wikimedia.org/T278423#7094641 |
[analytics] |