AIX主機監控腳本
該腳本包含對主機的CPU、MEMORY、IO、NET、HACMP、ERROR REPORT監控。
對數據庫的表空間、JOB、ALERT LOG等的監控。
#!/bin/sh

# Load the configuration file first; if it cannot be read, report the
# problem and exit.
SOURCE=$HOME/config/config

# check_source - source the settings file named by $SOURCE.
# Globals:  SOURCE (read) - path of the settings file.
# Outputs:  on failure, an error message on stderr.
# Returns:  status of the sourced file on success; on failure the function
#           does not return, it exits the script with status 1.
check_source()
{
    if [ -r "$SOURCE" ]; then
        # Quoted so that a path containing blanks is still one word.
        . "$SOURCE"
    else
        echo "$(basename "$0"): Cannot locate the default setting file." >&2
        exit 1
    fi
}

# Define the report header.
# report_header - print the banner placed at the top of a report.
# Globals:  HOSTIP, HOSTNAME, USER (written).
# Outputs:  the banner text on stdout.
report_header()
{
    # Second word of the second line of `ifconfig -a`.
    # NOTE(review): presumably the first interface's inet address - confirm.
    HOSTIP=$(ifconfig -a | sed -n '2p' | awk '{print $2}')
    HOSTNAME=$(hostname)
    USER=$(who am i | cut -d " " -f1)
    # The here-document runs up to the line holding only "!"; the variables
    # and the date command inside it are expanded.
    cat <<!
Hostname: $HOSTNAME Server: $HOSTIP
User: $USER Time: $(date '+%Y-%m-%d %H:%M:%S')

SYSTEM CHECK REPORT
===================

!
}

# Define the log directory and the log file name; the current user's home
# directory is used as LOG_PATH.
LOG_PATH=$HOME
LOG_FILE=$LOG_PATH/log$(date +%Y%m%d%H%M%S)

# Archive the log files left by earlier runs.
cd "$LOG_PATH" || {
    echo "$(basename "$0"): cannot change directory to $LOG_PATH" >&2
    exit 1
}
# Match "log" followed by a four-digit year, so archiving is not tied to one
# year.  Looping handles any number of matches; `test -f` on an unquoted glob
# fails as soon as more than one file matches.
for old_log in log[0-9][0-9][0-9][0-9]*; do
    # An unmatched pattern stays literal, so only act on real files.
    if [ -f "$old_log" ]; then
        # Errors (e.g. a missing niyl directory) are deliberately discarded.
        mv "$LOG_PATH/$old_log" "$LOG_PATH/niyl/" >/dev/null 2>&1
    fi
done

# Define the temp directory; create it first if it does not exist.
TEMP_PATH=$LOG_PATH/temp
if [ ! -d "$TEMP_PATH" ]; then
    mkdir "$TEMP_PATH"
fi

# Load the environment settings.
check_source

# Write the report header.
report_header >>"$LOG_FILE"


# Check CPU usage.
echo "***************************************** Check CPU *****************************************">>"$LOG_FILE"
# Echo every vmstat line.  For rows whose first field starts with a digit,
# add up column 16 (NOTE(review): presumably the idle percentage - confirm
# against the local vmstat layout) and report 100 minus its mean.
vmstat 1 10 | awk '
    { print $0 }
    $1 ~ /^[0-9].*/ { totalcpu += $16; samples++ }
    END {
        if (samples > 0)
            avecpu = 100 - totalcpu / samples
        else
            avecpu = 100
        print "The average usage of cpu is :" avecpu
    }' >"$TEMP_PATH/cpu_info"

cat "$TEMP_PATH/cpu_info" >>"$LOG_FILE"

cpu_used_pct=$(awk -F ":" '/The average usage of cpu is/ {print $2}' "$TEMP_PATH/cpu_info")
# The average may be fractional, but test(1) compares integers only, so the
# comparison uses the integer part; the full value is still reported.
cpu_used_int=${cpu_used_pct%%.*}
if [ "${cpu_used_int:-0}" -gt "$CPU_VALUE" ] ; then
    echo "LOG-Warnning:$(date '+%Y-%m-%d %H:%M:%S'), The CPU usage is up to $cpu_used_pct%. Please check the system.">>"$LOG_FILE"
else
    echo " The CPU load is OK!!">>"$LOG_FILE"
fi


# Monitor memory usage, including paging activity.
echo >>"$LOG_FILE"
echo >>"$LOG_FILE"
echo "***************************************** check memory useage *****************************************">>"$LOG_FILE"
# Re-read the saved vmstat output: echo every line and, for rows whose first
# field starts with a digit, total columns 6 and 7 (the page-in / page-out
# figures the message refers to).  Anything that is not below 10 on both
# totals is reported, so a total of exactly 10 no longer goes unreported.
awk '
    { print $0 }
    $1 ~ /^[0-9].*/ { totalpi += $6; totalpo += $7 }
    END {
        if (totalpi < 10 && totalpo < 10)
            print " The memory usage is OK!!"
        else
            print "The memory pagein and pageout is to high,Please check the usage of the memory!"
    }' "$TEMP_PATH/cpu_info" >>"$LOG_FILE"


# Check disk space.
echo >>"$LOG_FILE"
echo >>"$LOG_FILE"
echo "***************************************** check disk space *****************************************">>"$LOG_FILE"
df -k >>"$LOG_FILE"
# Save the first seven df columns as comma-separated records, leaving out
# the header line and anything mentioning proc.
df -k | grep -v proc | grep -v Filesystem |
    awk '{print $1","$2","$3","$4","$5","$6","$7}' >"$TEMP_PATH/disk_info"

# Column 4 is compared with DISK_VALUE and column 7 is named in the message.
grep -v '^#' "$TEMP_PATH/disk_info" |
while IFS=, read -r _fs _blocks _free used_pct _iused _iused_pct mount_point
do
    # Keep only the text before the percent sign.
    used_pct=${used_pct%%\%*}
    # Skip entries whose usage is not a plain number (e.g. "-"); they
    # cannot be compared with the threshold.
    case "$used_pct" in
        ''|*[!0-9]*) continue ;;
    esac
    if [ "$used_pct" -gt "$DISK_VALUE" ]; then
        echo "LOG-Warnning: $(date '+%Y-%m-%d %H:%M:%S'), $mount_point is not have enough space ,please check." >>"$LOG_FILE"
    else
        echo " The space of disk $mount_point is OK!!" >>"$LOG_FILE"
    fi
done

#
# Monitor disk I/O with iostat.
#
echo >>"$LOG_FILE"
echo >>"$LOG_FILE"
echo "***************************************** check iostat *****************************************">>"$LOG_FILE"
iostat 1 3 >>"$LOG_FILE"

# Monitor network traffic.
echo >>"$LOG_FILE"
echo >>"$LOG_FILE"
echo "***************************************** check netstat *****************************************">>"$LOG_FILE"
netstat -i >>"$LOG_FILE"

# Check the oracle background processes.
echo >>"$LOG_FILE"
echo >>"$LOG_FILE"
echo "***************************************** check oracle process *****************************************">>"$LOG_FILE"
# Save the last word of every ps line mentioning ora_ (the process name),
# and copy the full lines into the report.
ps -ef | grep ora_ | grep -v grep | awk '{print $NF}' >"$TEMP_PATH/ora_process_info"
ps -ef | grep ora_ | grep -v grep >>"$LOG_FILE"

# Count how many of the six background processes are present.  COUNT starts
# at 0 so the final comparison is valid even when none are found.
COUNT=0
for ora_proc in ora_ckpt_ora92 ora_dbw0_ora92 ora_reco_ora92 \
                ora_lgwr_ora92 ora_pmon_ora92 ora_smon_ora92
do
    if grep "$ora_proc" "$TEMP_PATH/ora_process_info" >/dev/null 2>&1; then
        COUNT=$((COUNT+1))
    else
        # Message text is kept exactly as before; the ckpt message has no
        # blank before the exclamation mark.
        case "$ora_proc" in
            ora_ckpt_*) msg_tail="was terminated!" ;;
            *)          msg_tail="was terminated !" ;;
        esac
        echo "LOG-Warnning: $(date '+%Y-%m-%d %H:%M:%S'),The Process $ora_proc $msg_tail" >>"$LOG_FILE"
    fi
done

if [ "$COUNT" -eq 6 ]; then
    echo >>"$LOG_FILE"
    echo " The main six Oracle processes is OK !!" >>"$LOG_FILE"
fi


#
# Check the oracle tablespace.
#
echo >>"$LOG_FILE"
echo >>"$LOG_FILE"
echo "***************************************** check oracle tablespace *****************************************">>"$LOG_FILE"
#su - oracle -c sqlplus dxh/dxh < /home/guest/dxhwh/niyl/tablespace_query.sql >>$LOG_FILE
# NOTE(review): the credentials are hard-coded on the command line.
# The SQL is fed through a here-document ending at the "!EOF" line; the
# delimiter is quoted so the shell passes the text through untouched.
sqlplus -s dxh/dxh <<'!EOF' >"$TEMP_PATH/ts_info"
set pagesize 100
set linesize 100
col status for a10
col tablespace_name for a20
col contents for a10
col "size(M)" for a15
col used for a15
col pct for a10
select d.status, d.tablespace_name,
TO_CHAR(NVL(a.bytes / 1024 /1024, 0),'99G999G990') "size(M)",
TO_CHAR(NVL(a.bytes - NVL(f.bytes, 0),0)/1024/1024, '99G999G990D00') used,
TO_CHAR(NVL((a.bytes - NVL(f.bytes, 0)) / a.bytes * 100, 0), '990D00')||'%' pct
FROM sys.dba_tablespaces d,
(select tablespace_name, sum(bytes) bytes from dba_data_files group by tablespace_name) a,
(select tablespace_name, sum(bytes) bytes from dba_free_space group by tablespace_name) f
WHERE d.tablespace_name = a.tablespace_name(+)
AND d.tablespace_name = f.tablespace_name(+)
order by tablespace_name ;
exit
!EOF

cat "$TEMP_PATH/ts_info" >>"$LOG_FILE"
# For every ONLINE row take the tablespace name (column 2) and the used
# percentage (column 5).
grep ONLINE "$TEMP_PATH/ts_info" | awk '{print $2, $5}' |
while read -r ts_name ts_used_pct
do
    # Reduce e.g. "87.50%" to its integer part for the comparison.
    ts_used_pct=${ts_used_pct%%\%*}
    ts_used_pct=${ts_used_pct%%.*}
    # Skip rows whose percentage is not a plain number.
    case "$ts_used_pct" in
        ''|*[!0-9]*) continue ;;
    esac
    if [ "$ts_used_pct" -ge "$TS_VALUE" ]; then
        echo "LOG-Warnning: $(date '+%Y-%m-%d %H:%M:%S'),表空間$ts_name 的剩余空間緊張,請盡快清理表空間!" >>"$LOG_FILE"
    else
        echo " The tablespace of $ts_name is OK!!" >>"$LOG_FILE"
    fi
done

#
# Check the oracle Job.
#
echo >>"$LOG_FILE"
echo >>"$LOG_FILE"
echo "***************************************** check oracle job *****************************************">>"$LOG_FILE"
# Human-readable job listing, appended straight to the report.  The
# here-document ends at the "!!ET" line.
sqlplus -s dxh/dxh <<'!!ET' >>"$LOG_FILE"
col job for 999
col last_date for a20
col next_date for a20
col what for a40
set linesize 120

select job,what,
to_char(last_date,'yyyy-mm-dd hh24:mi:ss') last_date,
to_char(next_date,'yyyy-mm-dd hh24:mi:ss') next_date,
failures
from dba_jobs
order by job;
!!ET

# Machine-readable listing: each data row is tagged with XXX so it can be
# picked out below.  "rou" is round(next_date-sysdate,2)*100.
sqlplus -s dxh/dxh <<'!EOF' >"$TEMP_PATH/job_info"
col flag for a5
col rou for 99999
select 'XXX' flag,job,failures,broken,round(next_date-sysdate,2)*100 rou from dba_jobs order by job;
!EOF

grep XXX "$TEMP_PATH/job_info" | awk '{print $2,$3,$4,$5}' |
while read -r jobnum failure broken round
do
    # A job is healthy when it has no failures, is not marked broken, and
    # its "rou" value is within the limit: at most 100 for jobs 3 and 4,
    # exactly 0 for every other job.  The broken flag is compared with
    # spaces around "="; written as one word the test is always true.
    job_ok=no
    if [ "$failure" -eq 0 ] && [ "$broken" = "N" ]; then
        if [ "$jobnum" -eq 3 ] || [ "$jobnum" -eq 4 ]; then
            if [ "$round" -le 100 ]; then
                job_ok=yes
            fi
        else
            if [ "$round" -eq 0 ]; then
                job_ok=yes
            fi
        fi
    fi

    if [ "$job_ok" = yes ]; then
        echo " The Job $jobnum is OK!!" >>"$LOG_FILE"
    else
        echo "LOG-Warnning: $(date '+%Y-%m-%d %H:%M:%S'),The Job $jobnum was terminated !" >>"$LOG_FILE"
    fi
done

#
# Check the oracle session.
#
# The here-document delimiter is quoted so that the shell leaves the SQL
# alone; unquoted, "v$session" would lose "$session" to variable expansion.
sqlplus -s dxh/dxh <<'!' >>"$LOG_FILE"
select 'The Total sessions number is '||count(*)||'.' from v$session ;
select 'table mt: ' ,count(*) from t_dxh_mt where msgresult='SUCCESS';
select 'table detect:' ,count(*) from t_dxh_opendetect where msgresult='SUCCESS';
exit
!


#
# Check oracle table for user information sync.
#
echo >>"$LOG_FILE"
echo >>"$LOG_FILE"
echo "*****************************************oracle 同步數據檢查信息輸出*****************************************">>"$LOG_FILE"
# Row counts of the two user-info tables, each tagged with a NUM_ label.
# The here-document ends at the "!" line.
sqlplus -s ccmdxh/ccm@ccmdxh <<'!' >"$TEMP_PATH/jiya_info"
select 'NUM_P630' flag,count(*) from T_DXH_USERINFO ;
select 'NUM_p570' flag,count(*) from T_DXH_USERINFO2 ;
!
cat "$TEMP_PATH/jiya_info" >>"$LOG_FILE"
# First word is the label, second the row count.
# NOTE(review): a count of at most 2000 is treated as healthy - presumably
# these tables hold rows still waiting to be synchronised; confirm.
grep NUM_ "$TEMP_PATH/jiya_info" | grep -v COUNT |
while read -r zhuji user_num _rest
do
    if [ "$user_num" -le 2000 ]; then
        echo " The node $zhuji users sync is OK!! " >>"$LOG_FILE"
    else
        echo "LOG-Warnning: The node $zhuji users sync terminated abnormally.Please check !!" >>"$LOG_FILE"
    fi
done

#
# Check oracle alert log.
#
echo >>"$LOG_FILE"
echo >>"$LOG_FILE"
echo "***************************************** check oracle alert log *****************************************">>"$LOG_FILE"
# Take the last 300 lines of the alert log and drop lines mentioning
# "Thread" or "Current", lines containing today's weekday and month
# abbreviation, and lines containing a ":NN:" time fragment.
tail -n 300 "$ORACLE_BASE/admin/ora92/bdump/alert_ora92.log" |
    grep -v Thread |
    grep -v Current |
    grep -v "$(date +'%a %h')" |
    grep -v ":[0-9][0-9]:" >>"$LOG_FILE"

#
# Check system error report.
#
echo >>"$LOG_FILE"
echo >>"$LOG_FILE"
echo "***************************************** check system err *****************************************">>"$LOG_FILE"
errpt | head -n 10 >>"$LOG_FILE"
# Look for entries stamped today.  The second errpt column is matched on its
# leading MMDD and on characters 9-10 as the two-digit year.
# NOTE(review): assumes the errpt TIMESTAMP layout is MMDDhhmmYY - confirm.
# Matching MMDD alone would also hit the same day of another year.
today=$(date +%m%d%y)
day=${today%??}
year=${today#????}
# awk only sets the exit status, so nothing leaks onto the terminal.
if errpt | awk -v day="$day" -v year="$year" '
        substr($2, 1, 4) == (day "") && substr($2, 9, 2) == (year "") { found = 1 }
        END { exit (found ? 0 : 1) }'
then
    echo "LOG-Warnning: $(date '+%Y-%m-%d %H:%M:%S'),The system has found a error today.Please check the error report." >>"$LOG_FILE"
else
    echo >>"$LOG_FILE"
    echo " There is no system error report today.System is OK!!" >>"$LOG_FILE"
fi

#
# Check HACMP.
#
echo >>"$LOG_FILE"
echo >>"$LOG_FILE"
echo "***************************************** check HACMP status *****************************************">>"$LOG_FILE"
# Collect the clstat and lssrc output in one file and copy it to the report.
/usr/es/sbin/cluster/clstat -o >"$TEMP_PATH/ha_info"
lssrc -g cluster >>"$TEMP_PATH/ha_info"
cat "$TEMP_PATH/ha_info" >>"$LOG_FILE"
echo >>"$LOG_FILE"
# From each "Node:" line keep the first word after the first colon (the node
# name) and the third word (the state).
grep "Node:" "$TEMP_PATH/ha_info" | awk -F ':' '{print $2,$3}' | awk '{print $1,$3}' |
while read -r node_name node_state
do
    node="${node_name}'s"
    # Only the state field is tested, so a node name ending in "UP" cannot
    # be mistaken for a healthy state.
    case "$node_state" in
        *UP)
            echo " The node $node is OK!!" >>"$LOG_FILE"
            ;;
        *)
            echo "$(date '+%Y-%m-%d %H:%M:%S'),LOG-Warnning: The node $node status is DOWN ,it was terminated ." >>"$LOG_FILE"
            ;;
    esac
done