话说被分配一个任务,把旧版文档目录迁移到新版。用到一些思路,记录一下。

旧版目录结构数据源在 MYSQL, 作为懒人,尽量回避 mysql connect, 就直接用 JS 搞吧。

打开旧版本的后台,看了一下dom 结构,把数据先抽出来。

2013-12-27_115253

由于页数不多,手动翻翻页面,把数据保存一下就好。

// Grab every category link found in the second cell of each row of the article list table.
var anchors = $('.article_list tbody tr td:nth-child(2) a');

// Map of category name -> category id (the part of the href that follows "id=").
var data = {};
var cursor = anchors.length;
while (cursor--) {
    var anchor = anchors[cursor];
    var categoryId = anchor.getAttribute('href').split('id=')[1];
    var categoryName = anchor.innerText;
    data[categoryName] = categoryId;
    console.log('分类名称:', categoryName, '分类ID:', categoryId);
}

// Dump the page's result as JSON so it can be copied out of the console.
console.log(JSON.stringify(data));

把上面各页得到的数据保存成一个大数组(即下面代码中的 conv),接着去重,并换成 id:value 的形式。

/**
 * Merge the per-page scrape results into one map and flip it from
 * name -> id to id -> name.
 *
 * Pages are walked from last to first, so when the same name occurs on
 * several pages the entry from the earliest page is the one that is kept.
 * The inverted map is built in a separate object: adding and deleting keys
 * on the object that is being enumerated would clobber entries whenever a
 * category name happens to equal another category's id.
 *
 * @param {Array<Object>} pages - one { name: id } object per scraped page.
 * @returns {Object} map of id -> name.
 */
function mergeAndInvert(pages) {
    var hasOwn = Object.prototype.hasOwnProperty;
    var list = pages || [];
    // Prototype-less dictionary so no category name can collide with Object.prototype members.
    var idByName = Object.create(null);
    var nameById = {};
    var i;
    var name;

    for (i = list.length - 1; i >= 0; i--) {
        for (name in list[i]) {
            // Only copy the page's own entries, never inherited properties.
            if (hasOwn.call(list[i], name)) {
                idByName[name] = list[i][name];
            }
        }
    }

    for (name in idByName) {
        nameById[idByName[name]] = name;
    }

    return nameById;
}

var data = mergeAndInvert(conv);

console.log(JSON.stringify(data));

得到数据如下:

{"3":" 开发手册","7":" 资费问题","8":"稳定性和数据安全","9":"备案域名和内容审核","10":"其他问 题","11":"技术问题","13":"API","15":"代码部署工具","19":"概况","20":"新手上路","21":"平台介绍","32":"StdLib源代码","37":"SAE图标及素材","39":"限制配额","40":"经典案例","41":" 视频教程","147":"公共资源","150":"讲座及资料","155":"资费业务说明","156":"云豆赠送规则","163":"重 要更新","164":"运行原理","166":"应用管理","177":"加入我们","178":"媒体报道","179":"新浪云计算介 绍","189":"支付操作手册","190":"应用商店1","191":"应 用介绍","192":"MySQL","193":"AppConfig","194":"Counter","195":"Cron","196":"DeferredJob","197":"FetchURL","198":"Image","199":"KVDB","200":"Mail","201":"Memcache","202":"Rank","203":"RDC","204":"Storage","205":"TaskQueue","206":"TmpFS","207":"Web服务器","208":"SAE与标 准LAMP差异","209":"服务总线","210":"XHProf","211":"代码部署问 题","212":"SVN使用教程","213":"Hello World","214":"创建部署应 用","216":" 开发实例","218":"Wrappers","219":"应用体 检","220":"禁用函数和类","224":"中文分词服 务","225":"全文检索服务","226":"地理信息服务","227":"misc","228":"微盘开放接 口","229":"苹果消息推送服务","230":"提交应用","231":"本地开发环 境","232":"应用移植指南","235":"移动云平台介 绍","237":"认证规则","238":"违规应用","239":"反馈奖励","245":"故障补偿规则","250":"用户协议","251":"移动云插件与扩展","253":"移动云平台入 门","259":"移动云模板介绍","260":"扩展 API","262":"移动云最新更新","264":"Socket","267":"服务商店","269":"服务商店介绍","271":"接入服务","272":"移动云介绍与公开课","279":"CDN","280":"AppCan中间件介绍","281":"数据下载中心","289":"快速指引","290":"SAE Python环境","291":"可用服务列表","292":"相关工具","293":"FAQ","301":"服务","304":"Python","306":"扩展服务","307":"Java平台概述","308":"快速入门","309":"运行时环 境","310":"搭建本地环境","311":"使用服务","312":"分布式session","313":"框架支持","314":"查看日志","315":"资源下 载","316":"File Wrap","317":"Eclipse插件","319":"Cron使 用","321":"安全相关","322":"应用仓库","324":"Java FAQ","325":"TmpFS服 务","326":"TaskQueue服务","327":"FetchURL服务","328":"Mail服 务","329":"Storage服务","330":"Memcache服务","331":"KVDB服 务","332":"MySQL服务","333":"PHP常见问题","335":"实名认 证","336":"开发者认证","337":"教育机构认 证","340":"认证服务流程","341":"取消认证规则","342":"新版本地开发环境","343":"MiniSAE","344":"手动安装 MiniSAE","345":"Storage服务说 明","346":"Storage大文件上传","347":"工作原理","348":"签名算法","349":"服务审核及发 布","350":"接 
入流程","351":"接入实例","352":"服务接入协议","353":"SDK下载及调试","354":"XML文档定 义","355":"状态码和错误码定义","356":"API规范","357":"创建应用","358":"调试工具","359":"应用打包","360":"Tab应用模版","361":"位置服务模版","362":"微博 插件","363":"语音识别","364":"图片滤镜","365":"子浏览 器","366":"短信发送","367":"定时器","368":"顶栏通知","369":"Utils","370":"短信管 理","371":"屏幕朝向","372":"二维码","373":"API Stdlib源代码","374":"默认war包","375":"js/css 库","376":"分词服务","377":"Channel"}

新的规范中,新目录结构如下:

2013-12-27_120439

我们看到,四级目录,其中三级目录开始,容量是两位数,所以要考虑ID容纳空间的问题。

这里可以考虑半手动的方式,就是把目录按照结构输入到一个wordpress分类中,然后在模版中写一句话,把结构打出来。

<?php /* Wrapper with id "tree" so the printed category list can be selected from JavaScript.
         The query string asks for categories ordered by id, without post counts, and with
         hide_empty=0 (presumably so categories that have no posts are still listed; confirm
         against the wp_list_categories documentation). */ ?>
<div id="tree">
    <?php wp_list_categories('title_li=0&orderby=id&show_count=0&depth=99&hide_empty=0'); ?>
</div>

外面添加一个容器,一会方便用js来搞成和上面一样的数据结构。 打开wordpress的页面,把这里的结构搞出来。

2013-12-27_120848

// Map of generated numeric ID -> category name, filled in by the tree walk below.
var data = {};

/**
 * Return 100 raised to the given power, used as the place value of a directory level.
 *
 * @param {number} n - how many times to multiply by 100.
 * @returns {number} 100 multiplied by itself n times (1 when n is 0).
 */
var base = function (n) {
    // A sub-directory may hold a two-digit number of entries, hence steps of 100.
    var step = 100;
    var factor = 1;
    for (var remaining = n; remaining > 0; remaining--) {
        factor *= step;
    }
    return factor;
};

// Text of the link that sits directly inside a list item.
var labelOf = function (item) {
    return $(item).children('a').text();
};

// List items one level below the given list item.
var itemsUnder = function (item) {
    return $(item).children('ul').children('li');
};

// Level 1 (only 3 entries).
$('#tree').children('li').each(function (i1, item1) {
    var id1 = (i1 + 1) * base(4);
    data[id1] = labelOf(item1);

    // Level 2 (only 4 entries).
    itemsUnder(item1).each(function (i2, item2) {
        var id2 = id1 + (i2 + 1) * base(3);
        data[id2] = labelOf(item2);

        // Level 3.
        itemsUnder(item2).each(function (i3, item3) {
            var id3 = id2 + (i3 + 1) * base(2);
            data[id3] = labelOf(item3);

            // Level 4: the position is added as-is, with no multiplier.
            itemsUnder(item3).each(function (i4, item4) {
                data[id3 + (i4 + 1)] = labelOf(item4);
            });
        });
    });
});

console.log(JSON.stringify(data));

因为只有四级,结构也比较死,直接写成固定的代码,拿出来数据就可以了,没必要过度浪费时间写递归。 我们继续得到新的数据。

{"100000000":"了解SAE","101000000":"平台介绍","101010000":"安全相关","101020000":"概况","102000000":"资费介绍","102010000":"资费业务说明","102020000":"云豆赠送规则","102030000":"支付操作手册","103000000":"平台规则","103010000":"认证规则","103010001":"实名认证","103010002":"开发者认证","103010003":"教育机构认证","103010004":"认证服务流程","103010005":"取消认证规则","103020000":"违规应用","103030000":"反馈奖励","103040000":"故障补偿规则","104000000":"资源","104010000":"教程","104010001":"视频教程","104010002":"SVN使用教程","104010003":"Hello World","104010004":"创建部署应用","104020000":"代码部署工具","104030000":"本地开发环境","104040000":"MiniSAE","104050000":"SAE图标及素材","105000000":"常见问题FAQ","105010000":"资费问题","105020000":"稳定性和数据安全","105030000":"备案域名和内容审核","105040000":"本地开发环境和MiniSAE","105050000":"技术问题","105060000":"应用打包","105070000":"其他问题","200000000":"开发者中心","201000000":"服务总线","202000000":"PHP","202010000":"开发手册","202010001":"API","202010002":"StdLib源代码","202010003":"公共资源","202010004":"扩展API","202020000":"Web服务器","202030000":"SAE与标准LAMP差异","202040000":"禁用函数和类","202050000":"PHP FAQ","203000000":"限制配额","204000000":"服务","204010000":"MySQL","204020000":"AppConfig","204030000":"Counter","204040000":"Cron","204050000":"DeferredJob","204060000":"FetchURL","204070000":"Image","204080000":"KVDB","204090000":"Mail","204100000":"Memcache","204110000":"Rank","204120000":"RDC","204130000":"Storage","204140000":"TaskQueue","204150000":"TmpFS","204160000":"XHProf","204170000":"Wrappers","204180000":"Socket","204190000":"CDN","204200000":"js/css库","204210000":"Channel","204220000":"数据下载中心","204230000":"应用防火墙","204240000":"应用体检","204250000":"扩展服务","204250001":"中文分词服务","204250002":"全文检索服务","204250003":"地理信息服务","204250004":"微盘开放接口","204250005":"苹果消息推送服务","204250006":"音频二维码服务","204250007":"SendCloud邮件服务","204250008":"Face++人脸 
检测","204250009":"有道翻译","204250010":"科大讯飞语音云服务","205000000":"Java","205010000":"Java平台概述","205020000":"快速入门","205030000":"运行时环境","205040000":"搭建本地环境","205050000":"使用服务","205060000":"分布式session","205070000":"框架支持","205080000":"查看日志","205090000":"Eclipse插件","205100000":"默认war包","205110000":"资源下载","205120000":"Java FAQ","206000000":"Python","206010000":"快速入门","206020000":"SAE Python环境","206030000":"相关工具","206040000":"Python FAQ","207000000":"移动云","207010000":"移动云平台入门","207010001":"创建应用","207010002":"调试工具","207010003":"应用打包","207020000":"移动云插件与扩展","207020001":"微博插件","207020002":"语音识别","207020003":"图片滤镜","207020004":"子浏览器","207020005":"短信发送","207020006":"定时器","207020007":"顶栏通知","207020008":"Utils","207020009":"短信管理","207020010":"屏幕朝向","207020011":"二维码","207030000":"移动云模板介绍","207030001":"Tab应用模板","207030002":"位置服务模板","207040000":"AppCan中间件介绍","300000000":"关于新浪云计算","301000000":"新浪云计算介绍","302000000":"媒体报道","303000000":"加入我们"}

ID的长度为9位,如果以后要扩容也毫无压力,当然这里排除了以后目录存在上百个的情况,因为运营的童鞋和产品的童鞋不会允许文档过长的。

这里发现第一组数据中的value是包含空格的,编辑器或者脚本中搜索"\s",正则批量替换掉。 接着开始记录关联性数据了,这里比较笨,用循环的方式。

// Declaring the two data sets shown above as newData and oldData is omitted here.

/**
 * Pair every new category ID with the old category ID that has the same name.
 *
 * Matched entries are removed from both inputs, so after the call newData and
 * oldData contain only the categories that still need to be paired by hand.
 * Each old ID is used at most once; when a name occurs several times, IDs are
 * paired in enumeration order. Old IDs are indexed by name once instead of
 * rescanning oldData for every new entry, and matching stops at the first
 * candidate so an already-removed entry can never be compared again.
 *
 * @param {Object} newData - map of new ID -> category name (mutated).
 * @param {Object} oldData - map of old ID -> category name (mutated).
 * @returns {Object} map of new ID -> old ID for every name found in both.
 */
function matchByName(newData, oldData) {
    var hasOwn = Object.prototype.hasOwnProperty;
    // Prototype-less dictionary: name -> queue of old IDs carrying that name.
    var oldIdsByName = Object.create(null);
    var matched = {};
    var oldId;
    var newId;
    var name;
    var candidates;

    for (oldId in oldData) {
        if (hasOwn.call(oldData, oldId)) {
            name = oldData[oldId];
            if (!oldIdsByName[name]) {
                oldIdsByName[name] = [];
            }
            oldIdsByName[name].push(oldId);
        }
    }

    for (newId in newData) {
        if (hasOwn.call(newData, newId)) {
            candidates = oldIdsByName[newData[newId]];
            if (candidates && candidates.length) {
                oldId = candidates.shift();
                matched[newId] = oldId;
                delete newData[newId];
                delete oldData[oldId];
            }
        }
    }

    return matched;
}

var tmpData = matchByName(newData, oldData);

/**
 * Serialize a value to JSON for printing.
 *
 * @param {*} data - value to serialize.
 * @returns {string} JSON text of data.
 */
function echo(data) {
    return JSON.stringify(data);
}

// Print the pairs, then whatever is left unmatched on each side.
console.log(echo(tmpData), echo(newData), echo(oldData));

console里回车一下,应该一瞬间就跑出结果了,然后把没有对应关系的分类手动补全一下就完事了。

至此,记录写完,大概用到的方法有:

  1. 利用第三方库文件将数据从DOM中剥离,抽象成数组结构。
  2. 目录设计预留容纳空间。
  3. 循环匹配,使用序列化的对象格式来继续处理数据。

如果你有类似需求并且数据量大的话,上面的可以优化的事情有这些:

  1. 手动合并数据改成自动合并,数据存储local storage或者indexed db。
  2. 手动请求分页改成ajax请求并解析新document中的对应dom。
  3. 最后一段处理,写匹配规则。

–EOF–