Script to Collect DB Upgrade/Migrate Diagnostic Information (dbupgdiag.sql)

This script is intended to provide a user friendly output to diagnose the status of the database either before (or) after upgrade. The script will create a file called db_upg_diag__.log.

-- - - - - - - - - - - - - - Script begins here - - - - - - - - - - - - - - 

col TODAY    NEW_VALUE    _DATE     
col VERSION NEW_VALUE _VERSION
set termout off
select to_char(SYSDATE,'fmMonth DD, YYYY') TODAY from DUAL;
select version from v$instance;
set termout on
set echo off
set feedback off
set head off
set verify off
Prompt
PROMPT Enter location for Spooled output:
Prompt
DEFINE log_path = &1
column timecol new_value timestamp
column spool_extension new_value suffix
SELECT to_char(sysdate,'dd-Mon-yyyy_hhmi') timecol,'.log' spool_extension FROM 
sys.dual;
column output new_value dbname
SELECT value || '_' output FROM v$parameter WHERE name = 'db_name';
spool &log_path/db_upg_diag_&&dbname&&timestamp&&suffix
set linesize 150
set pages 100
set trim on
set trims on
col Compatible for a35
col comp_id for a12
col comp_name for a40
col org_version for a11
col prv_version for a11
col owner for a12
col object_name for a40
col object_type for a40
col Wordsize for a25
col Metadata for a8
col 'Initial DB Creation Info' for a35
col 'Total Invalid JAVA objects' for a45
col 'Role' for a30
col 'User Existence' for a27
col "JAVAVM TESTING" for a15
Prompt
Prompt
set feedback off head off
select LPAD('*** Start of LogFile ***',50) from dual;
select LPAD('Oracle Database Upgrade Diagnostic Utility',44)||
 LPAD(TO_CHAR(SYSDATE, 'MM-DD-YYYY HH24:MI:SS'),26) from dual;
Prompt
Prompt ===============
Prompt Database Uptime
Prompt ===============
SELECT to_char(startup_time, 'HH24:MI DD-MON-YY') "Startup Time" 
FROM v$instance;
Prompt
Prompt =================
Prompt Database Wordsize
Prompt =================
SELECT distinct('This is a ' || (length(addr)*4) || '-bit database') "WordSize" 
FROM v$process;
Prompt
Prompt ================
Prompt Software Version
Prompt ================
SELECT * FROM v$version;
Prompt
Prompt =============
Prompt Compatibility
Prompt =============
SELECT 'Compatibility is set as '||value Compatible 
FROM v$parameter WHERE name ='compatible';
Prompt
Prompt ================
Prompt Component Status
Prompt ================
Prompt
SET SERVEROUTPUT ON;
DECLARE

ORG_VERSION varchar2(12);
PRV_VERSION varchar2(12);
P_VERSION VARCHAR2(10);

BEGIN 

SELECT version INTO p_version 
FROM registry$ WHERE cid='CATPROC' ;

IF SUBSTR(p_version,1,5) = '9.2.0' THEN

DBMS_OUTPUT.PUT_LINE(RPAD('Comp ID', 8) ||RPAD('Component',35)|| 
 RPAD('Status',10) ||RPAD('Version', 15));

DBMS_OUTPUT.PUT_LINE(RPAD(' ',8,'-') ||RPAD(' ',35,'-')|| 
 RPAD(' ',10,'-') ||RPAD(' ',15,'-'));

FOR x in (SELECT SUBSTR(dr.comp_id,1,8) comp_id,
 SUBSTR(dr.comp_name,1,35) comp_name, 
 dr.status Status,SUBSTR(dr.version,1,15) version
 FROM dba_registry dr,registry$ r
 WHERE dr.comp_id=r.cid and dr.comp_name=r.cname
 ORDER BY 1)

LOOP

DBMS_OUTPUT.PUT_LINE(RPAD(SUBSTR(x.comp_id,1,8),8) || 
 RPAD(SUBSTR(x.comp_name,1,35),35)||
 RPAD(x.status,10) || RPAD(x.version, 15));
END LOOP;

ELSIF SUBSTR(p_version,1,5) != '9.2.0' THEN

DBMS_OUTPUT.PUT_LINE(RPAD('Comp ID', 8) ||RPAD('Component',35)||  
 RPAD('Status',10) ||RPAD('Version', 15)||
 RPAD('Org_Version',15)||RPAD('Prv_Version',15));

DBMS_OUTPUT.PUT_LINE(RPAD(' ',8,'-') ||RPAD(' ',35,'-')|| 
 RPAD(' ',10,'-')||RPAD(' ',15,'-')||RPAD(' ',15,'-')||
 RPAD(' ',15,'-'));

FOR y in (SELECT SUBSTR(dr.comp_id,1,8) comp_id,
 SUBSTR(dr.comp_name,1,35) comp_name, dr.status Status, 
 SUBSTR(dr.version,1,11) version,org_version,prv_version
 FROM dba_registry dr,registry$ r
 WHERE dr.comp_id=r.cid and dr.comp_name=r.cname
 ORDER BY 1)

LOOP

DBMS_OUTPUT.PUT_LINE(RPAD(substr(y.comp_id,1,8), 8) || 
 RPAD(substr(y.comp_name,1,35),35)||RPAD(y.status,10) ||
 RPAD(y.version, 15)||RPAD(y.org_version,15)||RPAD(y.prv_version,15));

END LOOP;

END IF;
END;
/
SET SERVEROUTPUT OFF
Prompt
Prompt
Prompt ======================================================
Prompt List of Invalid Database Objects Owned by SYS / SYSTEM
Prompt ======================================================
Prompt
set head on
SELECT case count(object_name)
WHEN 0 THEN 'There are no Invalid Objects'
ELSE 'There are '||count(object_name)||' Invalid objects'
END "Number of Invalid Objects"
FROM dba_objects 
WHERE status='INVALID'
AND owner in ('SYS','SYSTEM');
Prompt
DOC 
################################################################

 If there are no Invalid objects below will result in zero rows.

################################################################
#
Prompt
set feedback on
SELECT owner,object_name,object_type 
FROM dba_objects 
WHERE status='INVALID' 
AND owner in ('SYS','SYSTEM')
ORDER BY owner,object_type;
set feedback off
Prompt
Prompt ================================
Prompt List of Invalid Database Objects
Prompt ================================
Prompt
set head on
SELECT case count(object_name)
WHEN 0 THEN 'There are no Invalid Objects'
ELSE 'There are '||count(object_name)||' Invalid objects'
END "Number of Invalid Objects"
FROM dba_objects 
WHERE status='INVALID'
AND owner not in ('SYS','SYSTEM');
Prompt
DOC
################################################################

 If there are no Invalid objects below will result in zero rows.

################################################################
#
Prompt
set feedback on
SELECT owner,object_name,object_type 
FROM dba_objects 
WHERE status='INVALID' 
AND owner not in ('SYS','SYSTEM')
ORDER BY owner,object_type;
set feedback off
Prompt
Prompt ==============================================================
Prompt Identifying whether a database was created as 32-bit or 64-bit
Prompt ==============================================================
Prompt
DOC 
###########################################################################

 Result referencing the string 'B023' ==> Database was created as 32-bit
 Result referencing the string 'B047' ==> Database was created as 64-bit
 When String results in 'B023' and when upgrading database to 10.2.0.3.0 
 (64-bit) , For known issue refer below articles

 Note 412271.1 ORA-600 [22635] and ORA-600 [KOKEIIX1] Reported While 
 Upgrading Or Patching Databases To 10.2.0.3
 Note 579523.1 ORA-600 [22635], ORA-600 [KOKEIIX1], ORA-7445 [KOPESIZ] and 
 OCI-21500 [KOXSIHREAD1] Reported While Upgrading To 11.1.0.6

###########################################################################
#
Prompt
SELECT SUBSTR(metadata,109,4) "Metadata",
CASE SUBSTR(metadata,109,4)
WHEN 'B023' THEN 'Database was created as 32-bit'
WHEN 'B047' THEN 'Database was created as 64-bit'
ELSE 'Metadata not Matching'
END "Initial DB Creation Info"
FROM sys.kopm$;
Prompt
Prompt ===================================================
Prompt Number of Duplicate Objects Owned by SYS and SYSTEM
Prompt ===================================================
Prompt
Prompt Counting duplicate objects ....
Prompt
SELECT count(1) 
FROM dba_objects 
WHERE object_name||object_type in 
 (SELECT object_name||object_type  
 from dba_objects 
 where owner = 'SYS') 
and owner = 'SYSTEM'; 
Prompt
Prompt =========================================
Prompt Duplicate Objects Owned by SYS and SYSTEM
Prompt =========================================
Prompt
Prompt Querying duplicate objects ....
Prompt
SELECT object_name, object_type 
FROM dba_objects 
WHERE object_name||object_type in 
 (SELECT object_name||object_type  
 FROM dba_objects 
 WHERE owner = 'SYS') 
AND owner = 'SYSTEM'; 
Prompt
DOC

################################################################################

 If any objects found please follow below article.
 Note 1030426.6 How to Clean Up Duplicate Objects Owned by SYS and SYSTEM schema
 Read the Exceptions carefully before taking actions.

################################################################################
#
Prompt
Prompt ================
Prompt JVM Verification
Prompt ================
Prompt
SET SERVEROUTPUT ON
DECLARE

V_CT NUMBER;
P_VERSION VARCHAR2(10);

BEGIN

-- If so, get the version of the JAVAM component
EXECUTE IMMEDIATE 'SELECT version FROM registry$ WHERE cid=''JAVAVM'' 
 AND status <> 99' INTO p_version;

SELECT count(*) INTO v_ct FROM dba_objects
WHERE object_type LIKE '%JAVA%' AND owner='SYS';

IF SUBSTR(p_version,1,5) = '8.1.7' THEN
 IF v_ct>=6787 THEN
 DBMS_OUTPUT.PUT_LINE('JAVAVM - Installed properly');
 ELSE
 DBMS_OUTPUT.PUT_LINE('JAVAVM - Not Installed properly');
 END IF;
ELSIF SUBSTR(p_version,1,5) = '9.0.1' THEN
 IF v_ct>=8585 THEN
 DBMS_OUTPUT.PUT_LINE('JAVAVM - Installed properly');
 ELSE
 DBMS_OUTPUT.PUT_LINE('JAVAVM - Not Installed properly');
 END IF;
ELSIF SUBSTR(p_version,1,5) = '9.2.0' THEN
 IF v_ct>=8585 THEN
 DBMS_OUTPUT.PUT_LINE('JAVAVM - Installed properly');
 ELSE
 DBMS_OUTPUT.PUT_LINE('JAVAVM - Not Installed properly');
 END IF;
ELSIF SUBSTR(p_version,1,6) = '10.1.0' THEN
 IF v_ct>=13866 THEN
 DBMS_OUTPUT.PUT_LINE('JAVAVM - Installed properly');
 ELSE
 DBMS_OUTPUT.PUT_LINE('JAVAVM - Not Installed properly');
 END IF;
ELSIF SUBSTR(p_version,1,6) = '10.2.0' THEN
 IF v_ct>=14113 THEN
 DBMS_OUTPUT.PUT_LINE('JAVAVM - Installed properly');
 ELSE
 DBMS_OUTPUT.PUT_LINE('JAVAVM - Not Installed properly');
 END IF;
END IF;

EXCEPTION WHEN NO_DATA_FOUND THEN
 DBMS_OUTPUT.PUT_LINE('JAVAVM - NOT Installed. Below results can be ignored');

END;
/
SET SERVEROUTPUT OFF
Prompt
Prompt ================================================
Prompt Checking Existence of Java-Based Users and Roles
Prompt ================================================
Prompt
DOC

################################################################################

 There should not be any Java Based users for database version 9.0.1 and above.
 If any users found, it is faulty JVM.

################################################################################
#

Prompt
SELECT CASE count(username)
WHEN 0 THEN 'No Java Based Users'
ELSE 'There are '||count(*)||' JAVA based users'
END "User Existence"
FROM dba_users WHERE username LIKE '%AURORA%' AND username LIKE '%OSE%';
Prompt
DOC

###############################################################

 Healthy JVM Should contain Six Roles. 
 If there are more or less than six role, JVM is inconsistent.

###############################################################
#

Prompt
SELECT CASE count(role)
WHEN 0 THEN 'No JAVA related Roles'
ELSE 'There are '||count(role)||' JAVA related roles'
END "Role"
FROM dba_roles 
WHERE role LIKE '%JAVA%';
Prompt
Prompt Roles
Prompt
SELECT role FROM dba_roles WHERE role LIKE '%JAVA%';
set head off
Prompt
Prompt =========================================
Prompt List of Invalid Java Objects owned by SYS
Prompt =========================================
SELECT CASE count(*) 
 WHEN 0 THEN 'There are no SYS owned invalid JAVA objects'
 ELSE 'There are '||count(*)||' SYS owned invalid JAVA objects'
 END "Total Invalid JAVA objects"
FROM dba_objects 
WHERE object_type LIKE '%JAVA%' 
AND status='INVALID' 
AND owner='SYS';
Prompt
DOC

#################################################################

 Check the status of the main JVM interface packages DBMS_JAVA 
 and INITJVMAUX and make sure it is VALID.
 If there are no Invalid objects below will result in zero rows.

#################################################################
#
Prompt
set feedback on
SELECT owner,object_name,object_type
FROM dba_objects 
WHERE object_type LIKE '%JAVA%' 
AND status='INVALID' 
AND owner='SYS';
set feedback off
Prompt
Prompt INFO: Below query should succeed with 'foo' as result.
set heading on
select dbms_java.longname('foo') "JAVAVM TESTING" from dual;
set heading off
Prompt 

set feedback off head off
select LPAD('*** End of LogFile ***',50) from dual;
set feedback on head on
Prompt
spool off
Prompt
set heading off
set heading off
set feedback off
select 'Upload db_upg_diag_&&dbname&&timestamp&&suffix from "&log_path" directory' 
from dual;
set heading on
set feedback on
Prompt
-- - - - - - - - - - - - - - - - Script ends here - - - - - - - - - - - - - -

Oracle数据库升级前必要的准备工作

Oracle数据库升级向来是一门纷繁复杂的工程,DBA需要为产品数据库的升级耗费大量时间精力在准备工作上;因为其升级复杂度高,所以即便做了较为充分的准备仍可能在升级过程中遇到意想不到的问题,为了更高效地完成升级任务和减少停机时间,我们有必要为升级工作营造一种”舒适的”防御式的数据库”氛围”:

1.为了保障升级后的数据库性能,我们有必要在升级前有效地收集数据库的性能统计信息,以便升级后若发生性能问题可以做出对比:

  • 为了保证性能统计信息真实有效,有必要在数据库升级前的一个月即开展收集工作
  • 收集的性能统计信息应当尽可能的精确真实
  • 在Oracle 8i/9i中使用Statspack性能报表,将快照级别设置为6或更高,设置快照间隔为30分钟,在具体升级前将perfstat用户使用exp工具导出,参考Metalink文档Note:466350.1介绍了若何对比升级前后的Statspack快照
  • 在Oracle 10g/11g中使用AWR自动负载仓库性能报告,保证采集30天左右的快照,快照间隔最好为30-60分钟;之后可以使用dbms_swrf_internal.awr_extract存储过程将AWR导出到dumpfile文件,在升级完成后载入这部分AWR信息,并可以使用DBMS_WORKLOAD_REPOSITORY.AWR_DIFF_REPORT_HTML函数对比升级前后的性能

2.正式升级前的防御性措施:

  • 过多的审计信息可能会导致升级速度下降,可以在升级前将审计数据导出,并清理审计字典基表:
截断SYS.AUD$基表:
SQL>TRUNCATE TABLE SYS.AUD$;
  • 同样的有必要清理10g后出现的回收站:
清理DBA回收站:
SQL>purge DBA_RECYCLEBIN;
  • 移除一些”过期”的参数,设置这些参数的原因很有可能是为了修正原版本上的一些问题,例如我们都会做的设置event参数;但在新版本中这些参数是否仍有必要设置是一个值得讨论的问题,当然你完全可以就此事去提交一个SR:
这些"过期"参数可能包括:过老的如optimizer_features_enable=8.1.7.4,_always_semi_join=off,_unnest_subquery=false
或者event = "10061 trace name context forever, level 10",如此之类等等。
  • 为数据库中的数据字典收集统计信息:
在Oracle 9i中可以执行以下过程收集数据字典统计信息,
SQL> exec DBMS_STATS.GATHER_SCHEMA_STATS
     ('SYS', options => 'GATHER',estimate_percent =>
      DBMS_STATS.AUTO_SAMPLE_SIZE, method_opt => 'FOR
      ALL COLUMNS SIZE AUTO', cascade => TRUE);

在Oracle10g/11g中收集字典统计信息可以由GATHER_DICTIONARY_STATS存储过程来完成:
SQL> exec DBMS_STATS.GATHER_DICTIONARY_STATS;
  • 为策万全,我们有必要为回退数据库升级任务做好准备,10g以前只能通过备份恢复来完成,10g以后我们可以利用闪回数据库的还原点特性来回退数据库,但需要注意以下几点:
    • 利用还原点要求数据库处于归档且打开flashback database的模式下
    • 在特性仅在版本10.2之后可用
    • 必须保证闪回回复区flashback recovery area有足够的磁盘空间
    • 注意在升级后不要立即修改compatible参数,restore point无法跨越compatible工作
/* 首先我们在正式升级前创建一个有效的保证闪回数据库的还原点 */

SQL> create restore point pre11gupgrd guarantee flashback database;
Restore point created.

/* 确认以上4个注意后,我们可以大胆放心地实施升级工作了 */
SQL> shutdown immediate;
..............
SQL> @?/rdbms/admin/catupgrd.sql
.............
upgrade failed

/* 在升级过程中出现了不可绕过的错误时,我们可能不得不回退数据库到还原点,也就是升级前*/

/* 关闭实例后,还原环境到10g下 */

SQL> startup mount;

/* 正式闪回到还原点pre11gupgrd */
SQL> flashback database to restore point pre11gupgrd;
Flashback complete.

SQL> alter database open;
alter database open
*
ERROR at line 1:
ORA-01589: must use RESETLOGS or NORESETLOGS option for database open

SQL>  alter database open resetlogs;

/* 以resetlogs打开数据库 */

/* 之后有必要删除这一个还原点 */
SQL> select * from v$restore_point;

       SCN DATABASE_INCARNATION# GUA STORAGE_SIZE
---------- --------------------- --- ------------
TIME
---------------------------------------------------------------------------
NAME
--------------------------------------------------------------------------------
   5081633                     3 YES     15941632
08-FEB-11 08.20.33.000000000 PM
PRE11GUPGRD

SQL> drop restore point pre11gupgrd;
Restore point dropped.
  • 下载最新版本的预升级检查脚本(pre-upgrade check script),如utlu102i.sql / utlu111i.sql / utlu112i.sql;Metalink文档Note:884522.1 <How to Download and Run Oracle’s Database Pre-Upgrade Utility> 指出了各版本utluxxx脚本的下载地址
/* 将升级信息spool到日志文件中 */
SQL> SPOOL /tmp/UPGRADE/utlu112i.log
SQL> @/tmp/UPGRADE/utlu112i.sql
  • 需要关注SYS和SYSTEM用户模式下的失效对象,有必要在升级前修复所有的失效对象:
SELECT UNIQUE object_name, object_type, owner
  FROM dba_objects
 WHERE status = 'INVALID';
  • 在升级完成后推荐执行utlrp.sql脚本以重新编译(Recompile)对象,从11.1.0.7开始升级前后的失效对象将自动对比,执行?/rdbms/admin/utluiobj.sql脚本可以列出对比信息,同时基表registry$sys_inv_objs和registry$nonsys_inv_objs分别列出了数据库中失效的sys或非sys对象:
SQL> select * from v$version;

BANNER
--------------------------------------------------------------------------------
Oracle Database 11g Enterprise Edition Release 11.2.0.1.0 - 64bit Production
PL/SQL Release 11.2.0.1.0 - Production
CORE    11.2.0.1.0      Production
TNS for Linux: Version 11.2.0.1.0 - Production
NLSRTL Version 11.2.0.1.0 - Production

SQL> @?/rdbms/admin/utluiobj.sql
.
Oracle Database 11.1 Post-Upgrade Invalid Objects Tool 02-08-2011 22:23:22
.
This tool lists post-upgrade invalid objects that were not invalid
prior to upgrade (it ignores pre-existing pre-upgrade invalid objects).
.
Owner                     Object Name                     Object Type
.
SH            FWEEK_PSCAT_SALES_MV               MATERIALIZED VIEW

PL/SQL procedure successfully completed.

3.解决升级过程中失效的组件(component)

  • 确保该部分组件确实被link到目前的Oracle软件2进制可执行文件或库文件中
  • 如果确认不会用到某些组件(component),想要通过手动彻底移除这部分组件(亦或者希望reinstall重新安装这部分组件),那么可以参考以下文档:
Note:472937.1 Information On Installed Database Components/Schemas
Note.300056.1 Debug and Validate Invalid Objects
Note:753041.1 How to diagnose Components with NON VALID status
Note.733667.1 How to Determine if XDB is Being Used in the Database?

组件升级失败实例1:数据库从10.2升级到11.2,在10g的环境中Database Vault组件已经安装,
Database Vault组件在升级relink前被turned off,在升级到11.2的过程中XDB组件升级失败;
其原因在于安装或切换Database Vault将使得XDB组件失效,或者由Bug 8942758引起。
解决方案是在升级前执行utlrp.sql脚本重新编译失效对象和组件,在此例中执行utlrp.sql可以使XDB组件valid.

组件升级失败实例2:数据库从10.2.0.4升级到11.1.0.7,在升级过程中"ORACLE SERVER"组件失效;
其原因在于DMBS_SQLPA包引用了某个不存在的列,该问题可以参考metalink文档782735.1和Notes:605317.1/736353.1。
有效的解决方案是:
1.在升级前将SYS.PLAN_TABLE$基表或者同义词PUBLIC.PLAN_TABLE DROP掉
2.若已执行了升级操作并遭遇了该问题,那么可以使用以下手段修复该问题:
@catplan.sql -- recreate the plan table
@dbmsxpln.sql -- reload dbms_xplan spec
@prvtxpln.plb -- reload dbms_xplan implementation
@prvtspao.plb -- reload dbms_sqlpa
alter package SYS.DBMS_SUMADVISOR compile ;
alter package SYS.DBMS_SUMADVISOR compile body;

4. 使用例如AIX上的slibclean等命令清理操作系统环境,在少数专有平台上不清理载入的共享库文件可能导致升级失败

5.在执行catupgrd.sql脚本正式升级前打开sqlplus的echo输出,将升级过程中所有的输出信息转储到日志文件中:

SQL> set echo on

SQL> SPOOL /tmp/upgrade.log

SQL> @catupgrd.sql
SQL> spool off

DBUA图形化升级工具默认使用spool和”echo”输出,这些日志可以在$ORACLE_HOME/cfgtoollogs/dbua//upgrade/目录下找到。

January 2011 Patch set Update发布

January 2011补丁集更新在大约2周前发布了,与Oracle Database相关的psu分别为10.2.0.5.2,10.2.0.4.7(REQUIRES PRE-REQUISITE 10.2.0.4.4),11.2.0.1.4,11.2.0.2.1.

在OEL5上安装配置Oracle Gird Control 10.2.0.5

早期的Grid Control问题实在太多了,以至于把10.2.0.1的Grid Control升级到10.2.0.5几乎是不可能完成的任务;此外10.2.0.5以前的gc不支持11g作为repository database仓库数据库,不仅于此10.2.0.1版本是不支持rhel5或OEL5的,如果想安装的话rhel/oel 4是仅有理想的平台。这就这为我们制造了许多局限。如果是Fresh Installation的话似乎只安装软件(software only),而不在安装10.2.0.1阶段配置oms,在升级到10.2.0.5后再进行oms的config会是一种比较理想的安装方法。不过我们仍需要应付一个复杂的配置过程,写这个文档的目的是帮助我们应付(包括已安装过的人,因为很容易忘记)这种窘况。

1.我们需要一个已经存在的Oracle数据库,当然它应当是纯净的(没有相关的em repository),其版本最好是10.2.0.5或者11.2.0.2;我们假设你的数据库已经满足了一切安装grid control的前提要求,这包括设置几个初始化参数和装有dbms_shared_pool包等等。

2.其次你需要下载对应的软件,这包括了完全版本的10.2.0.1安装介质和10.2.0.5 gc patchset,并且最好有10.2.0.5版的agent(个人经验agent不太稳定,有时候需要重配,所以有介质的话会方便些):

  • Linux_Grid_Control_full_102010_disk1.zip
  • Linux_Grid_Control_full_102010_disk2.zip
  • Linux_Grid_Control_full_102010_disk3.zip
  • gc_x86_64_10205_part1of2.zip
  • gc_x86_64_10205_part2of2.zip
  • Linux_x86_64_Grid_Control_agent_download_10_2_0_5_0.zip

要下载这一大堆介质可能是我们安装过程中最麻烦的一件事情,特别是当你的网络状况欠佳的季节。
因为都是zip包,所以你只需要使用unzip命令将以上介质一一解压到合适目录就可以了。

3.配置OMS所在主机的内核参数和rpm包,下面给出了相关配置文件的示例值:

/etc/sysctl.conf:
kernel.shmall = 2097152
kernel.shmmax = 536870912
kernel.shmmni = 4096
kernel.sem = 250 32000 100 128
fs.file-max = 65536
# semaphores: semmsl, semmns, semopm, semmni
net.ipv4.ip_local_port_range = 1024 65000
net.core.rmem_default = 262144
net.core.rmem_max = 262144
net.core.wmem_default = 262144
net.core.wmem_max = 262144

/* 注意以上参数并不一定适合你的主机,具体如何配置请参见Metalink文档 */

同时修改/etc/security/limits.conf参数文件:
*               soft    nproc   2047
*               hard    nproc   16384
*               soft    nofile  1024
*               hard    nofile  65536

/* 星号换上你的安装用户名,如oracle或者其他dba组成员 */

安装适当的rpm包,安装database时需要的包一律也都需要,此外请特别留意安装一下几个包:
compat-libstdc++-296-2.96-138.i386
libstdc++-devel-4.1.2-48.el5.x86_64
libstdc++-devel-4.1.2-48.el5.i386
glibc-devel-2.5-49.x86_64
glibc-devel-2.5-49.i386

并建立下列符号连接:
ln -s /usr/lib/libgdbm.so.2.0.0 /usr/lib/libdb.so.2

4.上述工作完成后我们需要修改response文件以满足安装的需要,在10.2.0.1安装介质的解压目录下操作:

[root@nas media]# ls
dcommon  doc  index.htm  install  libskgxn  oms  rdbms  response  runInstaller  stage

[root@nas media]# vi response/em_using_existing_db.rsp 

/* 修改em_using_existing_db.rsp响应文件 */

包括以下参数需要从默认值修改为指定值:
UNIX_GROUP_NAME="dba"
#dba应当是有效的安装用户所在组

FROM_LOCATION="/s01/media/oms/Disk1/stage/products.xml"
#FROM LOCATION指向安装介质stage目录下的products.xml文件

BASEDIR="/s01/app/gc"
#BASEDIR指向grid control安装的基础目录

INSTALLATION_NAME="oms10g"
#安装名

s_reposHost="rh3.oracle.com"
#repository数据库的主机名或ip地址

s_reposPort="1521"
#repository数据库的监听端口

s_reposSID="EMREP"
#repository数据库的sid

s_reposDBAPwd="maclean"
#repository数据库的sys用户密码

s_mgmtTbsName="/s01/orabase/oradata/EMREP/mgmt.dbf"
#repository数据库今后的mgmt表空间的数据文件名

s_ecmTbsName=s_mgmtTbsName="/s01/orabase/oradata/EMREP/mgmt_ecm.dbf"
#repository数据库今后的ecm表空间的数据文件名

s_securePassword="maclean"
#agent将来使用的secure密码

s_securePasswordConfirm="maclean"
#确认上一步的密码

b_lockedSelected=false
#确定agent交互是否被锁

s_reposPwd="maclean"
#确定仓库数据库中模式拥有者(sysman)的密码

s_reposPwdConfirm="maclean"
#确认上一步的密码

5.以静默方式安装grid control 10.2.0.1,但不配置oms:

[maclean@nas ~]$ export  TMP=/tmp

[maclean@nas ~]$  /s01/media/install/runInstaller -noconfig -silent -ignoreSysPrereqs -responseFile \
/s01/media/response/em_using_existing_db.rsp  use_prereq_checker=false b_skipDBValidation=true -force

以上安装完成后,运行相关脚本:
[maclean@nas ~]$ su - root -c "/home/maclean/oraInventory/orainstRoot.sh"

[maclean@nas ~]$ su - root -c "/s01/app/gc/oms10g/allroot.sh"

并使用opmonctl命令关闭http等服务:
[maclean@nas ~]$ /s01/app/gc/oms10g/opmn/bin/opmnctl stopall
opmnctl: stopping opmn and all managed processes...

5.接下来我们需要安装grid control 10.2.0.5补丁集,同样的需要修改一个response响应文件:

[maclean@nas 10205]$ unzip /tmp/gc_x86_64_10205_part2of2.zip
Archive:  /tmp/gc_x86_64_10205_part2of2.zip
extracting: p3731593_10205_Linux-x86-64.zip

[maclean@nas 10205]$ unzip p3731593_10205_Linux-x86-64.zip
..............

[maclean@nas ~]$ cp /s01/10205/3731593/Disk1/response/patchset.rsp /s01/10205/3731593/Disk1/response/oms_patchset.rsp

/* 对oms_patchset.rsp修改已有的参数为指定值 */

[maclean@nas ~]$ vi /s01/10205/3731593/Disk1/response/oms_patchset.rsp

ORACLE_HOME="/s01/app/gc/oms10g"
b_softwareonly=true
s_sysPassword="maclean"
sl_pwdInfo={ "maclean" }
oracle.iappserver.st_midtier:szl_InstanceInformation={ "maclean" }

ORACLE_HOME_NAME="oms10g"
#另外增加以上条目

[maclean@nas ~]$ /s01/10205/3731593/Disk1/runInstaller -noconfig -silent \
-responseFile /s01/10205/3731593/Disk1/response/oms_patchset.rsp

/* 以上10.2.0.5补丁安装完成后,同样需要执行root.sh */

[maclean@nas ~]$ su - root -c "/s01/app/gc/oms10g/root.sh"

6.完成上述安装后可以开始配置OMS了:

[maclean@nas ~]$ export PERL5LIB=/s01/app/gc/oms10g/perl/lib/5.6.1

[maclean@nas ~]$ /s01/app/gc/oms10g/perl/bin/perl /s01/app/gc/oms10g/sysman/install/ConfigureGC.pl \
/s01/app/gc
Base Directory: /s01/app/gc

 Starting ito execute Configuration Assistants: 

Running the configuration assistants using the following command:
/s01/app/gc/oms10g/oui/bin/runConfig.sh INV_PTR_LOC=/s01/app/gc/oms10g/oraInst.loc
ORACLE_HOME=/s01/app/gc/oms10g ACTION=configure MODE=perform
COMPONENT_XML={encap_oms.1_0_0_0_0.xml}
perform - mode is starting for action: configure

以上命令的具体形式为:
<OMS ORACLE_HOME>/perl/bin/perl <OMS ORACLE_HOME>/sysman/install/ConfigureGC.pl <Parent Directory filepath> 

配置会消耗大量的时间,建议通过.../oms10g/cfgtoollogs/oui/configActions<>.log.日志文件监控配置过程:

[root@nas oui]# cd /s01/app/gc/oms10g/cfgtoollogs/oui

[root@nas oui]# tail -f configActions2011-01-23_08-57-20-AM.log
... return status = 0 (success)
Oracle JAAS [Sun Jan 23 08:57:43 CST 2011]  $ORACLE_HOME/j2ee/home/config/jazn-data.xml is synchronized successfully to dcm repository.
Please check the log file [/s01/app/gc/oms10g/cfgtoollogs/jaznca.log] for details.

The plug-in Java Security Configuration Assistant has successfully been performed
------------------------------------------------------
------------------------------------------------------
The plug-in Web Cache Configuration Assistant is running

2
Start traversing...
got process-manager node
got ias-instance node
attrValue=IASPT

attrValue=DSA

attrValue=HTTP_Server

attrValue=LogLoader

attrValue=dcm-daemon

attrValue=OC4J

attrValue=WebCache

Entity found.

got ias-instance node
Current status is : enabled
Changing the value of port to enabled
 Modified ...
Before After WaitForComplete
After WaitForComplete
Completed smiSetStatus
Checking status ... enableconfiguration
In ... getWebcachePort
WebCache Default Port :7777
In ... updateApacheConf
Apache Port Value : 7777
Apache Before WaitForComplete
Apache After WaitForComplete
Checking for Apache updation status
Apache httpd.conf updated
smiTearDown
Will be checking the status ...
Webcache Configuration finished successfully

The plug-in Web Cache Configuration Assistant has successfully been performed
------------------------------------------------------
------------------------------------------------------
The plug-in Oracle Application Server Instance Configuration Assistant is running

The plug-in Oracle Application Server Instance Configuration Assistant has successfully been performed
------------------------------------------------------
------------------------------------------------------
The plug-in OC4J Instance Configuration Assistant is running

Reading ini file - '/s01/app/gc/oms10g/j2ee/deploy.ini'
Adding web-app 'IsWebCacheWorkingWeb.war' for app 'IsWebCacheWorking'.
Adding web-app 'wsrp-samples.war' for app 'portletapp'.
Initializing DCM...done.
OC4J instance 'home' already exists.
Starting OC4J instance 'home'...done.
Deploying application 'portletapp' to OC4J instance 'home'.
Notification ==> Application Deployer for portletapp STARTS [ 2011-01-23T08:58:00.972CST ]
Notification ==> Undeploy previous deployment
Notification ==> Removing files for app file:/s01/app/gc/oms10g/j2ee/home/applications/portletapp.ear
Notification ==> Copy the archive to /s01/app/gc/oms10g/j2ee/home/applications/portletapp.ear
Notification ==> Unpack portletapp.ear begins...
Notification ==> Unpack portletapp.ear ends...
Notification ==> Initialize portletapp.ear begins...
Notification ==> Initialize portletapp.ear ends...
Notification ==> Initialize wsrp-samples begins...
Notification ==> Initialize wsrp-samples ends...
Notification ==> deleting:  /s01/app/gc/oms10g/j2ee/home/applications/portletapp.ear
Notification ==> deleting:  /s01/app/gc/oms10g/j2ee/home/applications/portletapp/wsrp-samples.war
Notification ==> Application Deployer for portletapp COMPLETES [ 2011-01-23T08:58:01.319CST ] 

Deploying application 'IsWebCacheWorking' to OC4J instance 'home'.
Notification ==> Application Deployer for IsWebCacheWorking STARTS [ 2011-01-23T08:58:01.328CST ]
Notification ==> Undeploy previous deployment
Notification ==> Removing files for app file:/s01/app/gc/oms10g/j2ee/home/applications/IsWebCacheWorking.ear
Notification ==> Copy the archive to /s01/app/gc/oms10g/j2ee/home/applications/IsWebCacheWorking.ear
Notification ==> Unpack IsWebCacheWorking.ear begins...
Notification ==> Unpack IsWebCacheWorking.ear ends...
Notification ==> Initialize IsWebCacheWorking.ear begins...
Notification ==> Initialize IsWebCacheWorking.ear ends...
Notification ==> Initialize IsWebCacheWorkingWeb begins...
Notification ==> Initialize IsWebCacheWorkingWeb ends...
Notification ==> deleting:  /s01/app/gc/oms10g/j2ee/home/applications/IsWebCacheWorking.ear
Notification ==> deleting:  /s01/app/gc/oms10g/j2ee/home/applications/IsWebCacheWorking/IsWebCacheWorkingWeb.war
Notification ==> Application Deployer for IsWebCacheWorking COMPLETES [ 2011-01-23T08:58:01.362CST ] 

Calling updateConfig to notify DCM of new deployments...done.
Adding dependent libraries for application 'portletapp'...done.
Adding OC4J mount points for application 'portletapp'...done.
Adding OC4J mount points for application 'IsWebCacheWorking'...done.
Calling SMI to save changes.
SMISession.saveChanges succeeded.
Binding web app 'wsrp-samples' to default-web-site for application 'portletapp' in OC4J instance 'home'
Web app 'wsrp-samples' bound successfully.
Binding web app 'IsWebCacheWorkingWeb' to default-web-site for application 'IsWebCacheWorking' in OC4J instance 'home'
Web app 'IsWebCacheWorkingWeb' bound successfully.
Calling updateConfig to notify DCM of new web-bindings...done.
Adding application 'portletapp' to the DCM repository...done.
Application 'portletapp' deployed successfully.
Adding application 'IsWebCacheWorking' to the DCM repository...done.
Application 'IsWebCacheWorking' deployed successfully.
Stopping OC4J instance 'home'...done.
Calling SMI to retry init of failed plugins...done.
Terminating DCM...done.
Copying /s01/app/gc/oms10g/j2ee/deploy.ini to /s01/app/gc/oms10g/j2ee/deploy.ini.1295744298019.bak.
Writing any undeployed entries back to /s01/app/gc/oms10g/j2ee/deploy.ini.

Oc4jDeploy tool completed successfully!

The plug-in OC4J Instance Configuration Assistant has successfully been performed
------------------------------------------------------
------------------------------------------------------
The plug-in Register DCM Plug-Ins With EM is running

Operation successful.

The plug-in Register DCM Plug-Ins With EM has successfully been performed
------------------------------------------------------
------------------------------------------------------
The plug-in DCM Repository Backup Assistant is running

backup created: InstalledImage_EnterpriseManager0.nas

The plug-in DCM Repository Backup Assistant has successfully been performed
------------------------------------------------------
------------------------------------------------------
The plug-in EM Technology Stack Upgrade is running

The plug-in EM Technology Stack Upgrade has successfully been performed
------------------------------------------------------
------------------------------------------------------
The plug-in OMS Configuration is running

Operation Stopping OPMN Processes is in progress.
Operation EM Deploying is in progress.
Operation Creating OMS Respository is in progress.
Operation Configuring OMS is in progress.
OMS is being Secured and Lock is set to false.
Precompiling JSPs.
Performing installation of CLI services for client.
Operation Restarting OPMN Processes is in progress.

The plug-in OMS Configuration has successfully been performed
------------------------------------------------------
------------------------------------------------------
The plug-in Agent Configuration Assistant is running

Performing free port detection on host=nas
Securing the agent
Performing targets discovery and agent configuration

The plug-in Agent Configuration Assistant has failed its perform method
------------------------------------------------------
The action configuration has failed its perform method
###################################################

7.如果以上oms configuration顺利完成那么Grid Control的网页界面已经可以登录了,但我们在本地服务器上的agent仍是10.2.0.1版本的(所以Agent Configuration Assistant失败了),这里我推荐将原agent目录删除后通过10.2.0.5的agent介质(Linux_x86_64_Grid_Control_agent_download_10_2_0_5_0.zip)重新安装并部署,这样可以很大程度上避免出现问题。

Advice on upgrading to 11.2.0.2 and converting to RAC

Question:Problem Description: Currently we are running 11.2.0.1 Grid on a 2 nodes. We are facing DB crash issue and different bugs with current environment. One of the option Oracle provided is to upgrade Grid to 11.2.0.2 and convert to real RAC.

There are 11.2.0.1 database under the clusterware. These DBs are currently actively running on one node and cold failover HA enabled by Oracle’s perl scripts. We also have 10g standalone DBs running under the same clusterware.

We are planning the upgrade of Grid from 11.2.0.1 to 11.2.0.2, and also converting the cold faliover HA DBs to the real RAC 2 nodes or RAC 1 node.

We will need Oracle’s advice on the technical road map and procedures on this. Also the issues we might face. The upgrade is planned to start in one week from now. We really appreciate that Oracle get back to us as quickly as possible. Thanks.

Answer:
Please follow the below document for manual upgrade or using dbua.
Complete Checklist for Manual Upgrades to 11gR2 (Doc ID 837570.1)
Complete checklist to upgrade the database to 11g R2 using DBUA (Doc ID 870814.1)
Please follow the document to convert the single instance to RAC.
How to Convert 10g Single-Instance database to 10g RAC using Manual Conversion procedure (Doc ID 747457.1)

1. Our current version of Grid/clusterware is 11.2.0.1. Two versions of RDBMS are running currently on the cluster: 11.2.0.1 and 10.2.0.5. They are all on ASMs. +ASM1 is running on node 1 while +ASM2 is running on node 2. Some DBs are Standalone DBs and some DBs are Oracle active-passive HA with failover controlled by the Oracle perl scripts.
We are planning to upgrade Grid to 11.2.0.2 and converting Oracle HA to the real RAC 2 node or RAC 1 node. At the sametime, we are planning to move other system’s DBs onto the same cluster. The current DB versions for that system have 10.2.0.4,10.2.0.1, 9.2.0.7, 9.2.0.5.

Ans:

Oracle 9i Databases were not managed by Oracle Clusterware 11.1 already.The Clusterware has exceeded the limitation. So kindly request to upgrade the 9i database ASAP to manage these databases on 11gR2 clusterware.

2. Does Oracle have any suggestion on the order of upgrade to 11.2.0.2 and the converting to real RAC from Oracle HA under clusterware?

Ans:

There is no documentation. I assume that you have installed clusterware,ASM and RDBMS has been installed on active-passive(Oracle HA) nodes. You need to disable the pearl script and bring up the clusterware ,ASM and RDBMS instance on passive node.

3. In note 1212703.1 Oracle provided, patch 9974223 is mentioned if multicast is enabled only for 224.0.0.251 address. I am confused on the instructions in the readme of patch 9974223. It mentions to apply both on CRS_HOME and RDBMS HOME.

Ans:

If we upgrade RDBMS to 11.2.0.2 as well becides the Grid 11.2.0.2 upgrade, is it required to apply the patch to the RDBMS HOME too, or just optional? If it is required, does the patch only need to be applied in the RDBMS, which has RAC DB running? Does it need to apply on the Standalone DB on the Grid/ASM?

The patch 9974223 is applicable for CRS/RDBMS. You can apply this patch on both home. Yes. Please apply the patch on RDBMS home also. You no need to apply for standalone DB on the Grid/ASM.

1. Per Infrastructure Redundant Interconnect and ora.cluster_interconnect.haip [ID 1210883.1 ], Redundant Interconnect without any 3rd-party IP failover technology (bond, IPMP or similar) is supported natively by Grid Infrastructure starting from 11.2.0.2. The note is for the interconnect.
What about the Oracle VIP? We already binded the 2nd public interface of IPMP to the node-vip (nodeapps) in order to resolve the DB listener down issue when IPMP failover to 2nd interface , do we have to undo it before the 11.2.0.2 upgrade?h

Ans: – no need to undo ipmp what is already configured.

2. In the note Oracle provided: http://download.oracle.com/docs/cd/E11882_01/install.112/e17213/procstop.htm#CEGHBJDB, It says:
=========
– To upgrade existing 11.2.0.1 Oracle Grid Infrastructure installations to Oracle Grid Infrastructure 11.2.0.2, you must first do at least one of the following:

Patch the release 11.2.0.1 Oracle Grid Infrastructure home with the 9413827 and 9706490 patches.

Install Oracle Grid Infrastructure Patch Set 1 (GI PSU1) or Oracle Grid Infrastructure Patch Set 2 (GI PSU2).

– Oracle Clusterware and Oracle ASM upgrades are always out-of-place upgrades. With 11g release 2 (11.2), you cannot perform an in-place upgrade of Oracle Clusterware and Oracle ASM to existing homes.
==========

If the”Oracle Clusterware and Oracle ASM upgrades are always out-of-place upgrades for 11.2″, why still need to patch the existing 11.2.0.1 Grid CRS HOME as mentioned above? If it is out of place, then Grid will have a new CRS HOME.

Ans:- Yes. Still you need to apply the recommended patches before upgrade. The reason is due to known issues, it is recommended to apply the patch before upgrade.
Ofcourse the grid infra needs to be installed on new home. Please go thorough the Grid installation document for more detail.

1. Our Grid is 11.2.0.1.0, we only need to apply one patch 9413827?

Yes. Please apply the 9413827 only.

2. Do we need to apply 9413827 to the RDBMS Home for 2 HA DBs that are under CRS control right before we upgrade the Grid to 11.2.0.2?

Yes. Please apply it.

Please refer the below document to check RAC option is enabled or not.
How to check RAC Option is currently linked into the Oracle Binary (Doc ID 284785.1)

RAC one node is only supported with DB version of 11g under 11.2.0.2 Grid for Solaris 10, correct?

Yes. It is supported. Please find the certification metrics as below

OS Product Certified With Version Status
10 11gR2 64-bit Oracle Clusterware 11g Certified
10 11gR2 64-bit Oracle Solaris Cluster 3.3 Certified
10 11gR2 64-bit Oracle Solaris Cluster 3.2 Certified
10 11gR2 64-bit Veritas Storage Foundation for Oracle RAC 5.1 Certified
10 11gR2 64-bit Veritas Storage Foundation for Oracle RAC 5.0 Certified

We have some DBs with 10g version. Right now they are standalone DBs and can not be upgraded for now. Does Oracle support 10g database with RAC 2 nodes under 11.2.0.2 Grid on Solaris 10?

Yes. It is supported.

11.2.0.2补丁集安装体验

使用了Out-of-place Upgrade方式,安装图形界面沿袭了11.2.0.1的风格:
[Read more…]

Brain Split?

真正出现脑裂的几率并不高,但确实让我们碰上了。2个月前为一套AIX6.1上的10.2.0.4双节点RAC系统做故障测试,主要内容是拔除RAC interconnect网线,测试CRS能否正确处理私有网络挂掉的情况。

 

正式测试时发现2台主机都没有重启,而两端的CSS都认为对方节点已经down了。这就造成2个节点都以为自身是幸存者,也就是我们说的脑裂(brain split),此时实例一般会因为LMON进程的缘故而hang住。

 

我们来比对当时2个节点上的日志进一步分析:

 

STEP 1 :20:41:19物理拔出网线后,节点间无法正常通信,进入misscount倒计时600s
节点1:
[    CSSD]2010-06-22 20:41:21.465 [3342] >TRACE:   clssnmPollingThread: node gis2 (2) missed(2) checkin(s)
[    CSSD]2010-06-22 20:41:22.465 [3342] >TRACE:   clssnmPollingThread: node gis2 (2) missed(3) checkin(s)
.........
[    CSSD]2010-06-22 20:51:17.956 [3342] >TRACE:   clssnmPollingThread: node gis2 (2) missed(598) checkin(s)
[    CSSD]2010-06-22 20:51:18.963 [3342] >TRACE:   clssnmPollingThread: node gis2 (2) missed(599) checkin(s)
[    CSSD]2010-06-22 20:51:19.963 [3342] >TRACE:   clssnmPollingThread: Eviction started for node gis2 (2), flags 0x0001, state 3, wt4c 0

/* 节点1上完成倒计时后开始驱逐节点2*/

节点2:
[    CSSD]2010-06-22 20:41:19.598 [3342] >TRACE:   clssnmPollingThread: node gis1 (1) missed(2) checkin(s)
[    CSSD]2010-06-22 20:41:20.599 [3342] >TRACE:   clssnmPollingThread: node gis1 (1) missed(3) checkin(s)
......................
[    CSSD]2010-06-22 20:51:15.871 [3342] >TRACE:   clssnmPollingThread: node gis1 (1) missed(598) checkin(s)
[    CSSD]2010-06-22 20:51:16.871 [3342] >TRACE:   clssnmPollingThread: node gis1 (1) missed(599) checkin(s)
[    CSSD]2010-06-22 20:51:17.878 [3342] >TRACE:   clssnmPollingThread: Eviction started for node gis1 (1), flags 0x0001, state 3, wt4c 0

/*同样的节点2也是在10分钟后的51分开始驱逐节点1*/

STEP 2: 2个节点读取磁盘心跳信息(clssnmReadDskHeartbeat),且认为对方节点已经down了

节点1:
[    CSSD]2010-06-22 20:51:20.964 [3856] >TRACE:   clssnmSetupAckWait: node(1) is ACTIVE
[    CSSD]2010-06-22 20:51:20.964 [3856] >TRACE:   clssnmSendVote: syncSeqNo(3)
[    CSSD]2010-06-22 20:51:20.964 [3856] >TRACE:   clssnmWaitForAcks: Ack message type(13), ackCount(1)
[    CSSD]2010-06-22 20:51:20.965 [2057] >TRACE:   clssnmSendVoteInfo: node(1) syncSeqNo(3)
[    CSSD]2010-06-22 20:51:21.714 [1543] >TRACE:   clssnmReadDskHeartbeat: node(2) is down. rcfg(3) wrtcnt(4185) LATS(1628594178) Disk lastSeqNo(4185)
[    CSSD]2010-06-22 20:51:21.965 [3856] >TRACE:   clssnmWaitForAcks: done, msg type(13)
[    CSSD]2010-06-22 20:51:21.965 [3856] >TRACE:   clssnmCheckDskInfo: Checking disk info...
[    CSSD]2010-06-22 20:51:22.718 [1543] >TRACE:   clssnmReadDskHeartbeat: node(2) is down. rcfg(3) wrtcnt(4186) LATS(1628595183) Disk lastSeqNo(4186)
[    CSSD]2010-06-22 20:51:22.964 [3342] >TRACE:   clssnmPollingThread: node gis1 (1) missed(2) checkin(s)
[    CSSD]2010-06-22 20:51:23.722 [1543] >TRACE:   clssnmReadDskHeartbeat: node(2) is down. rcfg(3) wrtcnt(4187) LATS(1628596186) Disk lastSeqNo(4187)
[ CSSD]2010-06-22 20:51:24.724 [1543] >TRACE: clssnmReadDskHeartbeat: node(2) is down.
rcfg(3) wrtcnt(4188) LATS(1628597189) Disk lastSeqNo(4188)
.............................
[    CSSD]2010-06-22 20:59:49.953 [1543] >TRACE:   clssnmReadDskHeartbeat: node(2) is down. rcfg(3) wrtcnt(4692) LATS(1629102418) Disk lastSeqNo(4692)
[    CSSD]2010-06-22 20:59:50.057 [3085] >TRACE:   clssgmPeerDeactivate: node 2 (gis2), death 0, state 0x80000001 connstate 0xf
[    CSSD]2010-06-22 20:59:50.104 [1029] >TRACE:   clssnm_skgxncheck: CSS daemon failed on node 2
[    CSSD]2010-06-22 20:59:50.382 [2314] >TRACE:   clssgmClientConnectMsg: Connect from con(112a6c5b0) proc(112a5a190) pid() proto(10:2:1:1)
[    CSSD]2010-06-22 20:59:51.231 [3856] >TRACE:   clssnmEvict: Start
[    CSSD]2010-06-22 20:59:51.231 [3856] >TRACE:   clssnmEvict: Evicting node 2, birth 1, death 3, killme 1
[    CSSD]2010-06-22 20:59:51.232 [3856] >TRACE:   clssnmWaitOnEvictions: Start
[    CSSD]2010-06-22 20:59:51.232 [3856] >TRACE:   clssnmWaitOnEvictions: Node(0) down, LATS(0),timeout(1629103696)
[    CSSD]2010-06-22 20:59:51.232 [3856] >TRACE:   clssnmWaitOnEvictions: Node(2) down, LATS(1629102418),timeout(1278)
[    CSSD]2010-06-22 20:59:51.232 [3856] >TRACE:   clssnmSetupAckWait: Ack message type (15)
[    CSSD]2010-06-22 20:59:51.232 [3856] >TRACE:   clssnmSetupAckWait: node(1) is ACTIVE
[    CSSD]2010-06-22 20:59:51.232 [3856] >TRACE:   clssnmSendUpdate: syncSeqNo(3)
[    CSSD]2010-06-22 20:59:51.232 [3856] >TRACE:   clssnmWaitForAcks: Ack message type(15), ackCount(1)
[    CSSD]2010-06-22 20:59:51.232 [2057] >TRACE:   clssnmUpdateNodeState: node 0, state (0/0) unique (0/0) prevConuni(0) birth (0/0) (old/new)
[    CSSD]2010-06-2F1.232 [2057] >TRACE:   clssnmDeactivateNode: node 0 () left cluster

[    CSSD]2010-06-22 20:59:51.232 [2057] >TRACE:   clssnmUpdateNodeState: node 1, state (3/3) unique (1277207505/1277207505) prevConuni(0) birth (2/2) (old/new)
[    CSSD]2010-06-22 20:59:51.232 [2057] >TRACE:   clssnmUpdateNodeState: node 2, state (0/0) unique (1277206874/1277206874) prevConuni(1277206874) birth (1/0) (old/new)
[    CSSD]2010-06-22 20:59:51.232 [2057] >TRACE:   clssnmDeactivateNode: node 2 (gis2) left cluster

[    CSSD]2010-06-22 20:59:51.233 [2057] >USER:    clssnmHandleUpdate: SYNC(3) from node(1) completed
[    CSSD]2010-06-22 20:59:51.233 [2057] >USER:    clssnmHandleUpdate: NODE 1 (gis1) IS ACTIVE MEMBER OF CLUSTER
[    CSSD]2010-06-22 20:59:51.310 [4114] >TRACE:   clssgmReconfigThread:  started for reconfig (3)
[    CSSD]2010-06-22 20:59:51.310 [4114] >USER:    NMEVENT_RECONFIG [00][00][00][02]
[    CSSD]2010-06-22 20:59:51.310 [4114] >TRACE:   clssgmCleanupGrocks: cleaning up grock crs_version type 2
[    CSSD]2010-06-22 20:59:51.310 [4114] >TRACE:   clssgmCleanupGrocks: cleaning up grock ORA_CLSRD_1_gisdb type 2
[    CSSD]2010-06-22 20:59:51.310 [4114] >TRACE:   clssgmCleanupGrocks: cleaning up grock ORA_CLSRD_1_gisdb type 3
[    CSSD]2010-06-22 20:59:51.310 [4114] >TRACE:   clssgmCleanupGrocks: cleaning up grock ORA_CLSRD_2_gisdb type 3
[    CSSD]2010-06-22 20:59:51.310 [4114] >TRACE:   clssgmCleanupOrphanMembers: cleaning up remote mbr(0) grock(ORA_CLSRD_2_gisdb) birth(1/0)
[    CSSD]2010-06-22 20:59:51.310 [4114] >TRACE:   clssgmCleanupGrocks: cleaning up grock DBGISDB type 2
[    CSSD]2010-06-22 20:59:51.310 [4114] >TRACE:   clssgmCleanupGrocks: cleaning up grock DGGISDB type 2
[    CSSD]2010-06-22 20:59:51.310 [4114] >TRACE:   clssgmCleanupGrocks: cleaning up grock DAALL_DB type 2
[    CSSD]2010-06-22 20:59:51.310 [4114] >TRACE:   clssgmCleanupGrocks: cleaning up grock EVMDMAIN type 2
[    CSSD]2010-06-22 20:59:51.310 [4114] >TRACE:   clssgmCleanupGrocks: cleaning up grock CRSDMAIN type 2
[    CSSD]2010-06-22 20:59:51.310 [4114] >TRACE:   clssgmCleanupGrocks: cleaning up grock IGGISDBALL type 2
[    CSSD]2010-06-22 20:59:51.310 [4114] >TRACE:   clssgmCleanupGrocks: cleaning up grock ocr_crs type 2
[    CSSD]2010-06-22 20:59:51.310 [4114] >TRACE:   clssgmCleanupGrocks: cleaning up grock ORA_CLSRCSN_SRV_gisdb1 type 3
[    CSSD]2010-06-22 20:59:51.311 [4114] >TRACE:   clssgmEstablishConnections: 1 nodes in cluster incarn 3
[    CSSD]2010-06-22 20:59:51.311 [3085] >TRACE:   clssgmPeerListener: connects done (1/1)
[    CSSD]2010-06-22 20:59:51.311 [4114] >TRACE:   clssgmEstablishMasterNode: MASTER for 3 is node(1) birth(2)
[    CSSD]2010-06-22 20:59:51.311 [4114] >TRACE:   clssgmChangeMasterNode: requeued 1 RPCs
[    CSSD]2010-06-22 20:59:51.311 [4114] >TRACE:   clssgmMasterCMSync: Synchronizing group/lock status
[    CSSD]2010-06-22 20:59:51.312 [4114] >TRACE:   clssgmMasterSendDBDone: group/lock status synchronization complete
[    CSSD]CLSS-3000: reconfiguration successful, incarnation 3 with 1 nodes

[    CSSD]CLSS-3001: local node number 1, master node number 1

/* 节点1在hearbeat 8分钟左右后认为CSS daemon failed on node 2,正式认为Node 2离开了集群,并完成了reconfiguration*/

节点2:
[    CSSD]2010-06-22 20:51:18.892 [3856] >TRACE:   clssnmSendVote: syncSeqNo(3)
[    CSSD]2010-06-22 20:51:18.892 [3856] >TRACE:   clssnmWaitForAcks: Ack message type(13), ackCount(1)
[    CSSD]2010-06-22 20:51:18.892 [2057] >TRACE:   clssnmSendVoteInfo: node(2) syncSeqNo(3)
[    CSSD]2010-06-22 20:51:19.287 [1543] >TRACE:   clssnmReadDskHeartbeat: node(1) is down. rcfg(3) wrtcnt(3548) LATS(351788040) Disk lastSeqNo(3548)
[    CSSD]2010-06-22 20:51:19.892 [3856] >TRACE:   clssnmWaitForAcks: done, msg type(13)
[    CSSD]2010-06-22 20:51:19.892 [3856] >TRACE:   clssnmCheckDskInfo: Checking disk info...
[ CSSD]2010-06-22 20:51:20.288 [1543] >TRACE: clssnmReadDskHeartbeat: node(1) is down. rcfg(3) wrtcnt(3549) LATS(351789041) Disk lastSeqNo(3549)
[    CSSD]2010-06-22 20:51:21.308 [1543] >TRACE:   clssnmReadDskHeartbeat: node(1) is down. rcfg(3) wrtcnt(3550) LATS(351790062) Disk lastSeqNo(3550)
...........................
[    CSSD]2010-06-22 20:59:46.122 [1543] >TRACE:   clssnmReadDskHeartbeat: node(1) is down. rcfg(3) wrtcnt(4051) LATS(352294875) Disk lastSeqNo(4051)
[    CSSD]2010-06-22 20:59:46.341 [2314] >TRACE:   clssgmClientConnectMsg: Connect from con(112947c70) proc(112946f90) pid() proto(10:2:1:1)
[    CSSD]2010-06-22 20:59:46.355 [2314] >WARNING: clssgmShutDown: Received explicit shutdown request from client.
[    CSSD]2010-06-22 20:59:46.355 [2314] >TRACE:   clssgmClientShutdown: Aborting client (112a50210) proc (112a4e3d0)
[    CSSD]2010-06-22 20:59:46.355 [2314] >TRACE:   clssgmClientShutdown: Aborting client (112a50cd0) proc (112a4e3d0)
[    CSSD]2010-06-22 20:59:46.355 [2314] >TRACE:   clssgmClientShutdown: Aborting client (112a536f0) proc (112a4e3d0)
[    CSSD]2010-06-22 20:59:46.355 [2314] >TRACE:   clssgmClientShutdown: Aborting client (112a4eb90) proc (112a4eef0)
[    CSSD]2010-06-22 20:59:46.355 [2314] >TRACE:   clssgmClientShutdown: Aborting client (112a69250) proc (112a67e10)
[    CSSD]2010-06-22 20:59:46.355 [2314] >TRACE:   clssgmClientShutdown: Aborting client (112946050) proc (112945e50)
[    CSSD]2010-06-22 20:59:46.355 [2314] >TRACE:   clssgmClientShutdown: waited 0 seconds on 6 IO capable clients
[    CSSD]2010-06-22 20:59:46.494 [2314] >WARNING: clssgmClientShutdown: graceful shutdown completed.
[    CSSD]2010-06-22 20:59:47.130 [1543] >TRACE:   clssnmReadDskHeartbeat: node(1) is down. rcfg(3) wrtcnt(4052) LATS(352295883) Disk lastSeqNo(4052)
[    CSSD]2010-06-22 21:34:40.167 >USER:    Oracle Database 10g CSS Release 10.2.0.1.0 Production Copyright 1996, 2004 Oracle.  All rights reserved.

/* node2 也正确进行了heartbeat,并认为node(1) is down,最后被手动关闭;之后还原了网络故障,在21:34时CSS重新启动*/

 

如果你仔细看以上日志的话,你大概会找出”Oracle Database 10g CSS Release 10.2.0.1.0″的记录;这套RAC不是10.2.0.4的吗,为什么CSS还是10.2.0.1版本的呢,事后调查才发觉是安装该套系统的施工方国内某X码工程师在给CRS打补丁的时候忘记运行最后的root102.sh脚本了,该脚本将更新OCR/Voting disk及实际的CRS binary文件等,如果补丁安装结束后没有运行该脚本则升级不会有任何效果,而只会更新oraInventory中的信息。

 

刚开始时哪位X码的工程师抵死不肯承认忘记了运行脚本,而实际上在AIX 6.1上打10.2.0.4 CRS的patch是需要为oracle用户赋特有的权限的,这一点不同于AIX 5.3上,即:

 

chuser capabilities=CAP_BYPASS_RAC_VMM,CAP_PROPAGATE,CAP_NUMA_ATTACH oracle
/*进一步检查*/
lsuser -f oracle | grep capabilities
        capabilities=CAP_BYPASS_RAC_VMM,CAP_PROPAGATE,CAP_NUMA_ATTACH

 

如果未对oracle用户赋以上权限则运行root102.sh脚本时将报错。另一个判断的标志是pre10204/pre10205目录,如果运行过root102.sh脚本的话$ORA_CRS_HOME/install目录下会多出一个形如pre$VERSION的目录,没有的话一般就是没有运行过脚本,当然也有可能是时候删除了(不建议删除)。

 

了解到以上信息后对此次脑裂的追根溯源就要简单的多了,版本10201上的CRS可以说Bug众多的,从10201-10204期间CRS加入了不少新的参数和机制,在MOS上搜索关键词”brain split CSS”可以找到以下案例:

Hdr: 8293652 10.2.0.3 PCW 10.2.0.3 OSD PRODID-5 PORTID-46
Abstract: CSS CANNOT HANDLE SPLIT-BRAIN AND DB INSTANCE RECEIVES ORA-29740
PROBLEM:
——–
config:
2-node RAC: Node1 (pdb01) and Node2 (pdb02)
There’s no time difference between two nodes.

pdb02 got ORA-29740 and terminated at “Tue Feb 17 12:13:06 2009”
ORA-29740 occured with reason 1.
After ORA-29740 happened, the instance won’t be able to start
until rebooting OS.
After rebooting OS, everything was fine and instances were up.

DIAGNOSTIC ANALYSIS:
——————–
clssnmReadDskHeartbeat: node(2) is down. rcfg(8) wrtcnt(2494425)
LATS(1205488794) Disk lastSeqNo(2494425)
nodes
clssgmMasterSendDBDone: group/lock status synchronization complete
nodes

WORKAROUND:
———–
none

RELATED BUGS:
————-

REPRODUCIBILITY:
—————-
once at ct’s env.

TEST CASE:
———-

STACK TRACE:
————

SUPPORTING INFORMATION:
———————–

24 HOUR CONTACT INFORMATION FOR P1 BUGS:
—————————————-

DIAL-IN INFORMATION:
——————–

IMPACT DATE:
————

Does ct apply any CRS(bundle) patch ?

When problem happen, cssd can’t connect each other via interconnect,
but both cssd can do heartbeat to voting disk.
However, both cssd consider that “I’m survivor”.
Looking into node 1 cssd.
* 12:02:30.856 – Initiated sync
[ CSSD]2009-02-17 12:02:30.856 [1262557536] >TRACE: clssnmDoSyncUpdate:
Initiating sync 7
[ CSSD]2009-02-17 12:02:30.856 [1262557536] >TRACE: clssnmDoSyncUpdate:
diskTimeout set to (57000)ms

* Checking voting disk, and find node2 is still voting and living.
[ CSSD]2009-02-17 12:02:30.874 [1262557536] >TRACE: clssnmCheckDskInfo:
Checking disk info…
[ CSSD]2009-02-17 12:02:30.874 [1262557536] >TRACE: clssnmCheckDskInfo:
node(2) timeout(20) state_network(5) state_disk(3) misstime(0)
[ CSSD]2009-02-17 12:02:31.878 [1262557536] >TRACE: clssnmCheckDskInfo:
node(2) disk HB found, network state 5, disk state(3) misstime(1010)

* Compared cluster size and confirmed it can survive.
[ CSSD]2009-02-17 12:02:34.885 [1262557536] >TRACE: clssnmCheckDskInfo:
node 2, iz-pdb02, state 5 with leader 2 has smaller cluster size 1;
my cluster size 1 with leader 1

* Then finished
[ CSSD]2009-02-17 12:02:34.886 [1262557536] >TRACE: clssnmDoSyncUpdate:
Sync Complete!
*** 03/08/09 11:23 pm ***
Looking into node 2 cssd log.

* 12:02:20.647 – initiated sync protocol
[ CSSD]2009-02-17 12:02:20.647 [1262557536] >TRACE: clssnmDoSyncUpdate:
Initiating sync 7
[ CSSD]2009-02-17 12:02:20.647 [1262557536] >TRACE: clssnmDoSyncUpdate:
diskTimeout set to (57000)ms

* Checking disk and find node1 does not do disk heartbeart for 59690 ms.
it would have waited for misscount and considered node 1 is dead
[ CSSD]2009-02-17 12:02:22.285 [1262557536] >TRACE: clssnmCheckDskInfo:
Checking disk info…
[ CSSD]2009-02-17 12:02:22.285 [1262557536] >TRACE: clssnmCheckDskInfo:
node(1) timeout(59690) state_network(5) state_disk(3) misstime(61000)

* node2 is the only active member of cluster, finished.
[ CSSD]2009-02-17 12:02:22.723 [1262557536] >TRACE: clssnmDoSyncUpdate:
Sync Complete!
*** 03/08/09 11:45 pm ***
So, strange point is, node 2 cssd says node 1 cssd didn’t do
disk heartbeat for 60 seconds.

Looking into node1 cssd log just before initiating sync. We see 87sec gap.
———————————–
[ CSSD]2009-02-17 12:00:19.354 [1199618400] >TRACE: clssgmClientConnectMsg:
Connect from con(0x784d80) proc(0x7749b0) pid(14746) proto(10:2:1:1)
[ CSSD]2009-02-17 12:00:33.338 [1199618400] >TRACE: clssgmClientConnectMsg:
Connect from con(0x7d8620) proc(0x75cfb0) pid() proto(10:2:1:1)
[ CSSD]2009-02-17 12:01:03.688 [1199618400] >TRACE: clssgmClientConnectMsg:
Connect from con(0x76a390) proc(0x75cfb0) pid(13634) proto(10:2:1:1)
[ CSSD]2009-02-17 12:02:30.855 [1168148832] >WARNING: clssnmDiskPMT:
sltscvtimewait timeout (69200)
[ CSSD]2009-02-17 12:02:30.855 [1189128544] >WARNING: clssnmeventhndlr:
Receive failure with node 2 (iz-pdb02), state 3, con(0x72b980),
probe((nil)), rc=10
[ CSSD]2009-02-17 12:02:30.855 [1189128544] >TRACE: clssnmDiscHelper:
iz-pdb02, node(2) connection failed, con (0x72b980), probe((nil))
[ CSSD]2009-02-17 12:02:30.856 [1262557536] >TRACE: clssnmDoSyncUpdate:
Initiating sync 7

As an interesting point, clssgmClientConnectMsg does not show message,
but nm polling thread/disk ping thread does not warn timeout.
(Usually it should write message first at 50% of timeout = 30 sec)

And “sltscvtimewait timeout (69200)” message means, DiskPingMonitor thread
does not run for a 69200 ms whereas it just wants to sleep 1 second.

These suggest, cssd does not scheduled about 70 seconds on node 1.
I don’t see any log from DiskPingThread, but I assume it is suspended
at some point also, and back to work after 70 seconds.
Please check OS message file to see any interesting error is recorded.
To prevent this issue, keep watching OS performance to see if any
extreme high load does not happen.

Recommended solution it to go to 10.2.0.4 and use oprocd so that
we can expect oprocd kill node1 in such case.

 

 

上述案例同样是在”cssd can’t connect each other via interconnect”的状况下出现了”I’m survivor”的脑裂问题,MOS的建议是升级到10204后oprocd进程可以阻止这样的惨剧发生。

 

该问题最后通过升级到10.2.0.5解决了,这个case告诉我们在中国的it大环境内,有时候我们不得不亲力亲为地关心每一个细节,就拿这次来说我一开始也没发现升级没完成的情况,后来还是同事提醒了我;因为这是一个非常低级错误,如果施工方的X码工程师仔仔细细地按照他们下发的文档按部就班亦或者能留意一下升级时的图形窗口中的说明的话,这个问题都不会发生!而实际上不仅仅是此套系统,连带着其他2套系统也是这位X码工程师安装升级的,这几套系统在之后的故障测试时都发现了同样的问题。

事实告诉我们,细节决定成败!

Upgrading to RAC 11g R2 What you should know

How to upgrade?
Recommended: “Swing Kit”
Set up a new cluster on separate hardware
Install Grid Infrastructure at your leisure
Avoid time pressure
Create a physical standby on the new cluster
At a pre-defined outage window, perform a switchover
Remember: you don’t need to upgrade the database at the same time as you upgrade to Grid Infrastructure
The trick is to keep other standby databases in sync

Other upgrade options
If you don’t have Swing Kit…
Upgrade the existing software stack
Grid Infrastructure is an out of place upgrade
New Oracle home
Pre 11.2 Clusterware home and ASM home made redundant after successful upgrade
Out of place upgrade should make downgrade easier
Have not tested downgrade personally
Usually only one in the audience did

[gview file=”https://www.askmac.cn/wp-content/uploads/2011/08/072811_96317_ppt.ppt”]

沪ICP备14014813号-2

沪公网安备 31010802001379号